aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-05-27 10:43:44 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-27 10:43:44 -0700
commit105a048a4f35f7a74c7cc20b36dd83658b6ec232 (patch)
tree043b1110cda0042ba35d8aae59382bb094d0af3f
parent00b9b0af5887fed54e899e3b7f5c2ccf5e739def (diff)
parent9aeead73782c4b8e2a91def36dbf95db28605c95 (diff)
downloadlinux-2.6.34-ux500-105a048a4f35f7a74c7cc20b36dd83658b6ec232.tar.gz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (27 commits) Btrfs: add more error checking to btrfs_dirty_inode Btrfs: allow unaligned DIO Btrfs: drop verbose enospc printk Btrfs: Fix block generation verification race Btrfs: fix preallocation and nodatacow checks in O_DIRECT Btrfs: avoid ENOSPC errors in btrfs_dirty_inode Btrfs: move O_DIRECT space reservation to btrfs_direct_IO Btrfs: rework O_DIRECT enospc handling Btrfs: use async helpers for DIO write checksumming Btrfs: don't walk around with task->state != TASK_RUNNING Btrfs: do aio_write instead of write Btrfs: add basic DIO read/write support direct-io: do not merge logically non-contiguous requests direct-io: add a hook for the fs to provide its own submit_bio function fs: allow short direct-io reads to be completed via buffered IO Btrfs: Metadata ENOSPC handling for balance Btrfs: Pre-allocate space for data relocation Btrfs: Metadata ENOSPC handling for tree log Btrfs: Metadata reservation for orphan inodes Btrfs: Introduce global metadata reservation ...
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c109
-rw-r--r--fs/btrfs/ctree.h163
-rw-r--r--fs/btrfs/delayed-ref.c101
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c169
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2255
-rw-r--r--fs/btrfs/extent_io.c85
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/file-item.c28
-rw-r--r--fs/btrfs/file.c166
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1713
-rw-r--r--fs/btrfs/ioctl.c206
-rw-r--r--fs/btrfs/ordered-data.c82
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/relocation.c1971
-rw-r--r--fs/btrfs/root-tree.c23
-rw-r--r--fs/btrfs/super.c30
-rw-r--r--fs/btrfs/transaction.c232
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c241
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c17
-rw-r--r--fs/btrfs/xattr.c12
-rw-r--r--fs/direct-io.c62
-rw-r--r--include/linux/fs.h11
-rw-r--r--mm/filemap.c36
31 files changed, 5066 insertions, 2740 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a3014..7ec14097fef 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending)) {
spin_unlock_irq(&worker->lock);
+ set_current_state(TASK_RUNNING);
goto again;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee19983..6ad63f17eca 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
* of extent items we've reserved metadata for.
*/
spinlock_t accounting_lock;
+ atomic_t outstanding_extents;
int reserved_extents;
- int outstanding_extents;
/*
* ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
+ unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
/*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b20..0d1d966b0fe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- struct extent_buffer *cow)
+ struct extent_buffer *cow,
+ int *last_ref)
{
u64 refs;
u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
clean_tree_block(trans, root, buf);
+ *last_ref = 1;
}
return 0;
}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
int level;
+ int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow);
+ update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+ if (root->ref_cows)
+ btrfs_reloc_cow_block(trans, root, buf, cow);
if (buf == root->node) {
WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
extent_buffer_get(cow);
spin_unlock(&root->node_lock);
- btrfs_free_tree_block(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid, level);
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- btrfs_free_tree_block(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid, level);
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
return bin_search(eb, key, level, slot);
}
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) + size);
+ spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) - size);
+ spin_unlock(&root->accounting_lock);
+}
+
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
* NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ goto enospc;
+ }
spin_lock(&root->node_lock);
root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
- ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
- 0, root->root_key.objectid, level);
+
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
free_extent_buffer(mid);
- return ret;
+ return 0;
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- u64 bytenr = right->start;
- u32 blocksize = right->len;
-
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- free_extent_buffer(right);
- right = NULL;
wret = del_ptr(trans, root, path, level + 1, pslot +
1);
if (wret)
ret = wret;
- wret = btrfs_free_tree_block(trans, root,
- bytenr, blocksize, 0,
- root->root_key.objectid,
- level);
- if (wret)
- ret = wret;
+ root_sub_used(root, right->len);
+ btrfs_free_tree_block(trans, root, right, 0, 1);
+ free_extent_buffer(right);
+ right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- /* we've managed to empty the middle node, drop it */
- u64 bytenr = mid->start;
- u32 blocksize = mid->len;
-
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- free_extent_buffer(mid);
- mid = NULL;
wret = del_ptr(trans, root, path, level + 1, pslot);
if (wret)
ret = wret;
- wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
- 0, root->root_key.objectid, level);
- if (wret)
- ret = wret;
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
+ free_extent_buffer(mid);
+ mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(NULL, p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, gen);
+ tmp = read_tree_block(root, blocknr, blocksize, 0);
if (tmp) {
/*
* If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
p->nodes[level + 1],
p->slots[level + 1], &b);
if (err) {
- free_extent_buffer(b);
ret = err;
goto done;
}
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (IS_ERR(c))
return PTR_ERR(c);
+ root_add_used(root, root->nodesize);
+
memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_nritems(c, 1);
btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
int nritems;
BUG_ON(!path->nodes[level]);
+ btrfs_assert_tree_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
if (IS_ERR(split))
return PTR_ERR(split);
+ root_add_used(root, root->nodesize);
+
memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_level(split, btrfs_header_level(c));
btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
+ else
+ clean_tree_block(trans, root, left);
+
btrfs_mark_buffer_dirty(right);
btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(left);
if (right_nritems)
btrfs_mark_buffer_dirty(right);
+ else
+ clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
- if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(trans, root, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
&disk_key, 0, l->start, 0);
- if (IS_ERR(right)) {
- BUG_ON(1);
+ if (IS_ERR(right))
return PTR_ERR(right);
- }
+
+ root_add_used(root, root->leafsize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
- BUG_ON(ret);
+ if (ret)
+ goto err;
path->keep_locks = 0;
btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
- 0, root->root_key.objectid, 0);
- return ret;
+ root_sub_used(root, leaf->len);
+
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ return 0;
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
+ btrfs_set_path_blocking(path);
+ clean_tree_block(trans, root, leaf);
ret = btrfs_del_leaf(trans, root, path, leaf);
BUG_ON(ret);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678..e9bf86415e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
struct btrfs_trans_handle;
struct btrfs_transaction;
+struct btrfs_pending_snapshot;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+#define BTRFS_NR_RAID_TYPES 5
struct btrfs_block_group_item {
__le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
u64 flags;
u64 total_bytes; /* total bytes in the space */
- u64 bytes_used; /* total bytes used on disk */
+ u64 bytes_used; /* total bytes used,
+ this does't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
- u64 bytes_super; /* total bytes reserved for the super blocks */
- u64 bytes_root; /* the number of bytes needed to commit a
- transaction */
+
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
- u64 bytes_delalloc; /* number of bytes currently reserved for
- delayed allocation */
+ u64 disk_used; /* total bytes used on disk */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
- int force_delalloc; /* make people start doing filemap_flush until
- we're under a threshold */
struct list_head list;
- /* for controlling how we free up space for allocations */
- wait_queue_head_t allocate_wait;
- wait_queue_head_t flush_wait;
- int allocating_chunk;
- int flushing;
-
/* for block groups in our same type */
- struct list_head block_groups;
+ struct list_head block_groups[BTRFS_NR_RAID_TYPES];
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
};
+struct btrfs_block_rsv {
+ u64 size;
+ u64 reserved;
+ u64 freed[2];
+ struct btrfs_space_info *space_info;
+ struct list_head list;
+ spinlock_t lock;
+ atomic_t usage;
+ unsigned int priority:8;
+ unsigned int durable:1;
+ unsigned int refill_used:1;
+ unsigned int full:1;
+};
+
/*
* free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
+ /* block reservation for extent, checksum and root tree */
+ struct btrfs_block_rsv global_block_rsv;
+ /* block reservation for delay allocation */
+ struct btrfs_block_rsv delalloc_block_rsv;
+ /* block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ /* list of block reservations that cross multiple transactions */
+ struct list_head durable_block_rsv_list;
+
+ struct mutex durable_block_rsv_mutex;
+
u64 generation;
u64 last_trans_committed;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
- struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int enospc_unlink;
u64 total_pinned;
@@ -1012,6 +1035,9 @@ struct btrfs_root {
struct completion kobj_unregister;
struct mutex objectid_mutex;
+ spinlock_t accounting_lock;
+ struct btrfs_block_rsv *block_rsv;
+
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
int ref_cows;
int track_dirty;
int in_radix;
- int clean_orphans;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t list_lock;
+ spinlock_t orphan_lock;
struct list_head orphan_list;
+ struct btrfs_block_rsv *orphan_block_rsv;
+ int orphan_item_inserted;
+ int orphan_cleanup_state;
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 parent, u64 root_objectid, int level);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- struct btrfs_block_group_cache *group);
-
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
- struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 logical_offset, u32 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e..e807b143b85 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
}
/*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags)
-{
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *head;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- u32 item_size;
- u64 num_refs;
- u64 extent_flags;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = num_bytes;
- delayed_refs = &trans->transaction->delayed_refs;
-again:
- ret = btrfs_search_slot(trans, root->fs_info->extent_root,
- &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- } else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- struct btrfs_extent_item_v0 *ei0;
- BUG_ON(item_size != sizeof(*ei0));
- ei0 = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item_v0);
- num_refs = btrfs_extent_refs_v0(leaf, ei0);
- /* FIXME: this isn't correct for data */
- extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
- BUG();
-#endif
- }
- BUG_ON(num_refs == 0);
- } else {
- num_refs = 0;
- extent_flags = 0;
- ret = 0;
- }
-
- spin_lock(&delayed_refs->lock);
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
- if (ref) {
- head = btrfs_delayed_node_to_head(ref);
- if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&ref->refs);
- spin_unlock(&delayed_refs->lock);
-
- btrfs_release_path(root->fs_info->extent_root, path);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- goto again;
- }
- if (head->extent_op && head->extent_op->update_flags)
- extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
-
- num_refs += ref->ref_mod;
- mutex_unlock(&head->mutex);
- }
- WARN_ON(num_refs == 0);
- if (refs)
- *refs = num_refs;
- if (flags)
- *flags = extent_flags;
-out:
- spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
* bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad3..50e3cf92fbd 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 orig_parent,
u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d0..f3b287c22ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
int rw;
int mirror_num;
unsigned long bio_flags;
+ /*
+ * bio_offset is optional, can be used if the pages in the bio
+ * can't tell us where in the file the bio should go
+ */
+ u64 bio_offset;
struct btrfs_work work;
};
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
async = container_of(work, struct async_submit_bio, work);
fs_info = BTRFS_I(async->inode)->root->fs_info;
async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags);
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
}
static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
async->submit_bio_done(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags);
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
unsigned long bio_flags,
+ u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done)
{
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->work.flags = 0;
async->bio_flags = bio_flags;
+ async->bio_offset = bio_offset;
atomic_inc(&fs_info->nr_async_submits);
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
static int __btree_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
int ret;
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
*/
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num, 0,
+ bio_offset,
__btree_submit_bio_start,
__btree_submit_bio_done);
}
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->ref_cows = 0;
root->track_dirty = 0;
root->in_radix = 0;
- root->clean_orphans = 0;
+ root->orphan_item_inserted = 0;
+ root->orphan_cleanup_state = 0;
root->fs_info = fs_info;
root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ root->block_rsv = NULL;
+ root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->node_lock);
- spin_lock_init(&root->list_lock);
+ spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->accounting_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct extent_buffer *eb;
- struct btrfs_root *log_root_tree = fs_info->log_root_tree;
- u64 start = 0;
- u64 end = 0;
- int ret;
-
- if (!log_root_tree)
- return 0;
-
- while (1) {
- ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
- if (ret)
- break;
-
- clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
- }
- eb = fs_info->log_root_tree->node;
-
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) != 0);
-
- ret = btrfs_free_reserved_extent(fs_info->tree_root,
- eb->start, eb->len);
- BUG_ON(ret);
-
- free_extent_buffer(eb);
- kfree(fs_info->log_root_tree);
- fs_info->log_root_tree = NULL;
- return 0;
-}
-
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1191,19 +1172,23 @@ again:
if (root)
return root;
- ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
- if (ret == 0)
- ret = -ENOENT;
- if (ret < 0)
- return ERR_PTR(ret);
-
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret < 0)
+ goto fail;
+ if (ret == 0)
+ root->orphan_item_inserted = 1;
+
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto fail;
@@ -1212,10 +1197,9 @@ again:
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0) {
+ if (ret == 0)
root->in_radix = 1;
- root->clean_orphans = 1;
- }
+
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
do {
- smp_mb();
- if (root->fs_info->closing)
- break;
-
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
if (freezing(current)) {
refrigerator();
} else {
- smp_mb();
- if (root->fs_info->closing)
- break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ if (!kthread_should_stop())
+ schedule();
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
struct btrfs_root *root = arg;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
+ u64 transid;
unsigned long now;
unsigned long delay;
int ret;
do {
- smp_mb();
- if (root->fs_info->closing)
- break;
-
delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->new_trans_lock);
cur = root->fs_info->running_transaction;
if (!cur) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->new_trans_lock);
goto sleep;
}
now = get_seconds();
- if (now < cur->start_time || now - cur->start_time < 30) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (!cur->blocked &&
+ (now < cur->start_time || now - cur->start_time < 30)) {
+ spin_unlock(&root->fs_info->new_trans_lock);
delay = HZ * 5;
goto sleep;
}
- mutex_unlock(&root->fs_info->trans_mutex);
- trans = btrfs_start_transaction(root, 1);
- ret = btrfs_commit_transaction(trans, root);
+ transid = cur->transid;
+ spin_unlock(&root->fs_info->new_trans_lock);
+ trans = btrfs_join_transaction(root, 1);
+ if (transid == trans->transid) {
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
if (freezing(current)) {
refrigerator();
} else {
- if (root->fs_info->closing)
- break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(delay);
+ if (!kthread_should_stop() &&
+ !btrfs_transaction_blocked(root->fs_info))
+ schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+ INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+ mutex_init(&fs_info->durable_block_rsv_mutex);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
&fs_info->generic_worker);
- btrfs_init_workers(&fs_info->enospc_workers, "enospc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
csum_root->track_dirty = 1;
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+ fs_info->data_alloc_profile = (u64)-1;
+ fs_info->metadata_alloc_profile = (u64)-1;
+ fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups;
}
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
@@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BUG_ON(ret);
if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ BUG_ON(ret);
+
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
printk(KERN_WARNING
@@ -2040,7 +2029,6 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
down_write(&root->fs_info->cleanup_work_sem);
up_write(&root->fs_info->cleanup_work_sem);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_join_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
/* run commit again to drop the original snapshot */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_join_transaction(root, 1);
btrfs_commit_transaction(trans, root);
ret = btrfs_write_and_wait_transaction(NULL, root);
BUG_ON(ret);
@@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
-
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ kthread_stop(root->fs_info->transaction_kthread);
+ kthread_stop(root->fs_info->cleaner_kthread);
+
fs_info->closing = 2;
smp_mb();
@@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc191..88e825a0bf2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags,
+ unsigned long bio_flags, u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad7..b9080d71991 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, u64 num_bytes,
- int is_data, int reserved,
- struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
- if (atomic_dec_and_test(&cache->count))
+ if (atomic_dec_and_test(&cache->count)) {
+ WARN_ON(cache->pinned > 0);
+ WARN_ON(cache->reserved > 0);
+ WARN_ON(cache->reserved_pinned > 0);
kfree(cache);
+ }
}
/*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
exclude_super_stripes(extent_root, block_group);
spin_lock(&block_group->space_info->lock);
- block_group->space_info->bytes_super += block_group->bytes_super;
+ block_group->space_info->bytes_readonly += block_group->bytes_super;
spin_unlock(&block_group->space_info->lock);
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
+ flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+ BTRFS_BLOCK_GROUP_METADATA;
+
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
}
/*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u32 item_size;
+ u64 num_refs;
+ u64 extent_flags;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ if (!trans) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ num_refs = btrfs_extent_refs_v0(leaf, ei0);
+ /* FIXME: this isn't correct for data */
+ extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+ BUG();
+#endif
+ }
+ BUG_ON(num_refs == 0);
+ } else {
+ num_refs = 0;
+ extent_flags = 0;
+ ret = 0;
+ }
+
+ if (!trans)
+ goto out;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ if (head->extent_op && head->extent_op->update_flags)
+ extent_flags |= head->extent_op->flags_to_set;
+ else
+ BUG_ON(num_refs == 0);
+
+ num_refs += head->node.ref_mod;
+ mutex_unlock(&head->mutex);
+ }
+ spin_unlock(&delayed_refs->lock);
+out:
+ WARN_ON(num_refs == 0);
+ if (refs)
+ *refs = num_refs;
+ if (flags)
+ *flags = extent_flags;
+out_free:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
* Back reference rules. Back refs have three main goals:
*
* 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
return ret;
}
-
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
- int mark_free = 0;
- struct extent_buffer *must_clean = NULL;
-
- ret = pin_down_bytes(trans, root, NULL,
- node->bytenr, node->num_bytes,
- head->is_data, 1, &must_clean);
- if (ret > 0)
- mark_free = 1;
-
- if (must_clean) {
- clean_tree_block(NULL, root, must_clean);
- btrfs_tree_unlock(must_clean);
- free_extent_buffer(must_clean);
- }
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
- if (mark_free) {
- ret = btrfs_free_reserved_extent(root,
- node->bytenr,
- node->num_bytes);
- BUG_ON(ret);
- }
}
mutex_unlock(&head->mutex);
return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
ret = 0;
out:
btrfs_free_path(path);
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ WARN_ON(ret > 0);
return ret;
}
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
+ int i;
+ int factor;
+
+ if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
found = __find_space_info(info, flags);
if (found) {
spin_lock(&found->lock);
found->total_bytes += total_bytes;
found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- INIT_LIST_HEAD(&found->block_groups);
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
- init_waitqueue_head(&found->flush_wait);
- init_waitqueue_head(&found->allocate_wait);
spin_lock_init(&found->lock);
- found->flags = flags;
+ found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+ BTRFS_BLOCK_GROUP_SYSTEM |
+ BTRFS_BLOCK_GROUP_METADATA);
found->total_bytes = total_bytes;
found->bytes_used = bytes_used;
+ found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
found->bytes_reserved = 0;
found->bytes_readonly = 0;
- found->bytes_delalloc = 0;
+ found->bytes_may_use = 0;
found->full = 0;
found->force_alloc = 0;
*space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (!cache->ro) {
- cache->space_info->bytes_readonly += cache->key.offset -
- btrfs_block_group_used(&cache->item);
- cache->ro = 1;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-}
-
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return flags;
}
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
-{
- struct btrfs_fs_info *info = root->fs_info;
- u64 alloc_profile;
-
- if (data) {
- alloc_profile = info->avail_data_alloc_bits &
- info->data_alloc_profile;
- data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
- } else if (root == root->fs_info->chunk_root) {
- alloc_profile = info->avail_system_alloc_bits &
- info->system_alloc_profile;
- data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
- } else {
- alloc_profile = info->avail_metadata_alloc_bits &
- info->metadata_alloc_profile;
- data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
- }
-
- return btrfs_reduce_alloc_profile(root, data);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
- u64 alloc_target;
-
- alloc_target = btrfs_get_alloc_profile(root, 1);
- BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
- u64 num_bytes;
- int level;
-
- level = BTRFS_MAX_LEVEL - 2;
- /*
- * NOTE: these calculations are absolutely the worst possible case.
- * This assumes that _every_ item we insert will require a new leaf, and
- * that the tree has grown to its maximum level size.
- */
-
- /*
- * for every item we insert we could insert both an extent item and a
- * extent ref item. Then for ever item we insert, we will need to cow
- * both the original leaf, plus the leaf to the left and right of it.
- *
- * Unless we are talking about the extent root, then we just want the
- * number of items * 2, since we just need the extent item plus its ref.
- */
- if (root == root->fs_info->extent_root)
- num_bytes = num_items * 2;
- else
- num_bytes = (num_items + (2 * num_items)) * 3;
-
- /*
- * num_bytes is total number of leaves we could need times the leaf
- * size, and then for every leaf we could end up cow'ing 2 nodes per
- * level, down to the leaf level.
- */
- num_bytes = (num_bytes * root->leafsize) +
- (num_bytes * (level * 2)) * root->nodesize;
-
- return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 alloc_target;
- bool bug = false;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- num_items);
-
- spin_lock(&meta_sinfo->lock);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- if (BTRFS_I(inode)->reserved_extents <=
- BTRFS_I(inode)->outstanding_extents) {
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- spin_unlock(&meta_sinfo->lock);
- return 0;
- }
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
- BTRFS_I(inode)->reserved_extents -= num_items;
- BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
- if (meta_sinfo->bytes_delalloc < num_bytes) {
- bug = true;
- meta_sinfo->bytes_delalloc = 0;
- } else {
- meta_sinfo->bytes_delalloc -= num_bytes;
- }
- spin_unlock(&meta_sinfo->lock);
-
- BUG_ON(bug);
-
- return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
- u64 thresh;
-
- thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use;
-
- thresh = meta_sinfo->total_bytes - thresh;
- thresh *= 80;
- do_div(thresh, 100);
- if (thresh <= meta_sinfo->bytes_delalloc)
- meta_sinfo->force_delalloc = 1;
- else
- meta_sinfo->force_delalloc = 0;
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits &
+ root->fs_info->data_alloc_profile;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits &
+ root->fs_info->system_alloc_profile;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits &
+ root->fs_info->metadata_alloc_profile;
+ return btrfs_reduce_alloc_profile(root, flags);
}
-struct async_flush {
- struct btrfs_root *root;
- struct btrfs_space_info *info;
- struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
- struct async_flush *async;
- struct btrfs_root *root;
- struct btrfs_space_info *info;
-
- async = container_of(work, struct async_flush, work);
- root = async->root;
- info = async->info;
-
- btrfs_start_delalloc_inodes(root, 0);
- wake_up(&info->flush_wait);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
-
- kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
- DEFINE_WAIT(wait);
- u64 used;
-
- while (1) {
- prepare_to_wait(&info->flush_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&info->lock);
- if (!info->flushing) {
- spin_unlock(&info->lock);
- break;
- }
-
- used = info->bytes_used + info->bytes_reserved +
- info->bytes_pinned + info->bytes_readonly +
- info->bytes_super + info->bytes_root +
- info->bytes_may_use + info->bytes_delalloc;
- if (used < info->total_bytes) {
- spin_unlock(&info->lock);
- break;
- }
- spin_unlock(&info->lock);
- schedule();
- }
- finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct async_flush *async;
- bool wait = false;
-
- spin_lock(&info->lock);
+ u64 flags;
- if (!info->flushing)
- info->flushing = 1;
+ if (data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (root == root->fs_info->chunk_root)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
- wait = true;
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_on_flush(info);
- return;
- }
-
- async = kzalloc(sizeof(*async), GFP_NOFS);
- if (!async)
- goto flush;
-
- async->root = root;
- async->info = info;
- async->work.func = flush_delalloc_async;
+ flags = BTRFS_BLOCK_GROUP_METADATA;
- btrfs_queue_worker(&root->fs_info->enospc_workers,
- &async->work);
- wait_on_flush(info);
- return;
-
-flush:
- btrfs_start_delalloc_inodes(root, 0);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
+ return get_alloc_profile(root, flags);
}
-static int maybe_allocate_chunk(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
- struct btrfs_trans_handle *trans;
- bool wait = false;
- int ret = 0;
- u64 min_metadata;
- u64 free_space;
-
- free_space = btrfs_super_total_bytes(disk_super);
- /*
- * we allow the metadata to grow to a max of either 10gb or 5% of the
- * space in the volume.
- */
- min_metadata = min((u64)10 * 1024 * 1024 * 1024,
- div64_u64(free_space * 5, 100));
- if (info->total_bytes >= min_metadata) {
- spin_unlock(&info->lock);
- return 0;
- }
-
- if (info->full) {
- spin_unlock(&info->lock);
- return 0;
- }
-
- if (!info->allocating_chunk) {
- info->force_alloc = 1;
- info->allocating_chunk = 1;
- } else {
- wait = true;
- }
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_event(info->allocate_wait,
- !info->allocating_chunk);
- return 1;
- }
-
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 4096 + 2 * 1024 * 1024,
- info->flags, 0);
- btrfs_end_transaction(trans, root);
- if (ret)
- goto out;
-out:
- spin_lock(&info->lock);
- info->allocating_chunk = 0;
- spin_unlock(&info->lock);
- wake_up(&info->allocate_wait);
-
- if (ret)
- return 0;
- return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 used;
- u64 alloc_target;
- int flushed = 0;
- int force_delalloc;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- num_items);
-again:
- spin_lock(&meta_sinfo->lock);
-
- force_delalloc = meta_sinfo->force_delalloc;
-
- if (unlikely(!meta_sinfo->bytes_root))
- meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
- if (!flushed)
- meta_sinfo->bytes_delalloc += num_bytes;
-
- used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
- if (used > meta_sinfo->total_bytes) {
- flushed++;
-
- if (flushed == 1) {
- if (maybe_allocate_chunk(root, meta_sinfo))
- goto again;
- flushed++;
- } else {
- spin_unlock(&meta_sinfo->lock);
- }
-
- if (flushed == 2) {
- filemap_flush(inode->i_mapping);
- goto again;
- } else if (flushed == 3) {
- flush_delalloc(root, meta_sinfo);
- goto again;
- }
- spin_lock(&meta_sinfo->lock);
- meta_sinfo->bytes_delalloc -= num_bytes;
- spin_unlock(&meta_sinfo->lock);
- printk(KERN_ERR "enospc, has %d, reserved %d\n",
- BTRFS_I(inode)->outstanding_extents,
- BTRFS_I(inode)->reserved_extents);
- dump_space_info(meta_sinfo, 0, 0);
- return -ENOSPC;
- }
-
- BTRFS_I(inode)->reserved_extents += num_items;
- check_force_delalloc(meta_sinfo);
- spin_unlock(&meta_sinfo->lock);
-
- if (!flushed && force_delalloc)
- filemap_flush(inode->i_mapping);
-
- return 0;
-}
-
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 alloc_target;
- bool bug = false;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root, num_items);
-
- spin_lock(&meta_sinfo->lock);
- if (meta_sinfo->bytes_may_use < num_bytes) {
- bug = true;
- meta_sinfo->bytes_may_use = 0;
- } else {
- meta_sinfo->bytes_may_use -= num_bytes;
- }
- spin_unlock(&meta_sinfo->lock);
-
- BUG_ON(bug);
-
- return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 used;
- u64 alloc_target;
- int retries = 0;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root, num_items);
-again:
- spin_lock(&meta_sinfo->lock);
-
- if (unlikely(!meta_sinfo->bytes_root))
- meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
- if (!retries)
- meta_sinfo->bytes_may_use += num_bytes;
-
- used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
- if (used > meta_sinfo->total_bytes) {
- retries++;
- if (retries == 1) {
- if (maybe_allocate_chunk(root, meta_sinfo))
- goto again;
- retries++;
- } else {
- spin_unlock(&meta_sinfo->lock);
- }
-
- if (retries == 2) {
- flush_delalloc(root, meta_sinfo);
- goto again;
- }
- spin_lock(&meta_sinfo->lock);
- meta_sinfo->bytes_may_use -= num_bytes;
- spin_unlock(&meta_sinfo->lock);
-
- dump_space_info(meta_sinfo, 0, 0);
- return -ENOSPC;
- }
-
- check_force_delalloc(meta_sinfo);
- spin_unlock(&meta_sinfo->lock);
-
- return 0;
+ BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
}
/*
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
*/
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 used;
- int ret = 0, committed = 0, flushed = 0;
+ int ret = 0, committed = 0;
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
- used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
- data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
- data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
- data_sinfo->bytes_super;
+ used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+ data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+ data_sinfo->bytes_may_use;
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
- if (!flushed) {
- spin_unlock(&data_sinfo->lock);
- flush_delalloc(root, data_sinfo);
- flushed = 1;
- goto again;
- }
-
/*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ trans = btrfs_join_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret < 0)
return ret;
if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
if (ret)
return ret;
goto again;
}
- printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
- ", %llu bytes_used, %llu bytes_reserved, "
- "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
- "%llu total\n", (unsigned long long)bytes,
- (unsigned long long)data_sinfo->bytes_delalloc,
+#if 0 /* I hope we never need this code again, just in case */
+ printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+ "%llu bytes_reserved, " "%llu bytes_pinned, "
+ "%llu bytes_readonly, %llu may use %llu total\n",
+ (unsigned long long)bytes,
(unsigned long long)data_sinfo->bytes_used,
(unsigned long long)data_sinfo->bytes_reserved,
(unsigned long long)data_sinfo->bytes_pinned,
(unsigned long long)data_sinfo->bytes_readonly,
(unsigned long long)data_sinfo->bytes_may_use,
(unsigned long long)data_sinfo->total_bytes);
+#endif
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
}
/*
- * if there was an error for whatever reason after calling
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * called when we are clearing an delalloc extent from the
+ * inode's io_tree or there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
*/
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
- struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
spin_unlock(&data_sinfo->lock);
}
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
-{
- struct btrfs_space_info *data_sinfo;
-
- /* get the space info for where this inode will be storing its data */
- data_sinfo = BTRFS_I(inode)->space_info;
-
- /* make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- data_sinfo->bytes_delalloc += bytes;
-
- /*
- * we are adding a delalloc extent without calling
- * btrfs_check_data_free_space first. This happens on a weird
- * writepage condition, but shouldn't hurt our accounting
- */
- if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
- data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
- BTRFS_I(inode)->reserved_bytes = 0;
- } else {
- data_sinfo->bytes_may_use -= bytes;
- BTRFS_I(inode)->reserved_bytes -= bytes;
- }
-
- spin_unlock(&data_sinfo->lock);
-}
-
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
-{
- struct btrfs_space_info *info;
-
- info = BTRFS_I(inode)->space_info;
-
- spin_lock(&info->lock);
- info->bytes_delalloc -= bytes;
- spin_unlock(&info->lock);
-}
-
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_unlock();
}
+static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+ u64 alloc_bytes)
+{
+ u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved +
+ alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+ return 0;
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved +
+ alloc_bytes < div_factor(num_bytes, 8))
+ return 0;
+
+ return 1;
+}
+
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
- u64 thresh;
int ret = 0;
mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 8);
- if (!force &&
- (space_info->bytes_used + space_info->bytes_pinned +
- space_info->bytes_reserved + alloc_bytes) < thresh) {
+ if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ else
+ ret = 1;
space_info->force_alloc = 0;
spin_unlock(&space_info->lock);
out:
@@ -3461,13 +3073,713 @@ out:
return ret;
}
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, u64 num_bytes)
+{
+ int ret;
+ int end_trans = 0;
+
+ if (sinfo->full)
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+ spin_unlock(&sinfo->lock);
+ if (!ret)
+ return 0;
+
+ if (!trans) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ end_trans = 1;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes + 2 * 1024 * 1024,
+ get_alloc_profile(root, sinfo->flags), 0);
+
+ if (end_trans)
+ btrfs_end_transaction(trans, root);
+
+ return ret == 1 ? 1 : 0;
+}
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 to_reclaim)
+{
+ struct btrfs_block_rsv *block_rsv;
+ u64 reserved;
+ u64 max_reclaim;
+ u64 reclaimed = 0;
+ int pause = 1;
+ int ret;
+
+ block_rsv = &root->fs_info->delalloc_block_rsv;
+ spin_lock(&block_rsv->lock);
+ reserved = block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (reserved == 0)
+ return 0;
+
+ max_reclaim = min(reserved, to_reclaim);
+
+ while (1) {
+ ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+ if (!ret) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(pause);
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
+ } else {
+ pause = 1;
+ }
+
+ spin_lock(&block_rsv->lock);
+ if (reserved > block_rsv->reserved)
+ reclaimed = reserved - block_rsv->reserved;
+ reserved = block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (reserved == 0 || reclaimed >= max_reclaim)
+ break;
+
+ if (trans && trans->transaction->blocked)
+ return -EAGAIN;
+ }
+ return reclaimed >= to_reclaim;
+}
+
+static int should_retry_reserve(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ int ret;
+
+ if ((*retries) > 2)
+ return -ENOSPC;
+
+ ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+ if (ret)
+ return 1;
+
+ if (trans && trans->transaction->in_commit)
+ return -ENOSPC;
+
+ ret = shrink_delalloc(trans, root, num_bytes);
+ if (ret)
+ return ret;
+
+ spin_lock(&space_info->lock);
+ if (space_info->bytes_pinned < num_bytes)
+ ret = 1;
+ spin_unlock(&space_info->lock);
+ if (ret)
+ return -ENOSPC;
+
+ (*retries)++;
+
+ if (trans)
+ return -EAGAIN;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+
+ return 1;
+}
+
+static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 unused;
+ int ret = -ENOSPC;
+
+ spin_lock(&space_info->lock);
+ unused = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ if (unused < space_info->total_bytes)
+ unused = space_info->total_bytes - unused;
+ else
+ unused = 0;
+
+ if (unused >= num_bytes) {
+ if (block_rsv->priority >= 10) {
+ space_info->bytes_reserved += num_bytes;
+ ret = 0;
+ } else {
+ if ((unused + block_rsv->reserved) *
+ block_rsv->priority >=
+ (num_bytes + block_rsv->reserved) * 10) {
+ space_info->bytes_reserved += num_bytes;
+ ret = 0;
+ }
+ }
+ }
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
+static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ if (root->ref_cows)
+ block_rsv = trans->block_rsv;
+ else
+ block_rsv = root->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = &root->fs_info->empty_block_rsv;
+
+ return block_rsv;
+}
+
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ int ret = -ENOSPC;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved >= num_bytes) {
+ block_rsv->reserved -= num_bytes;
+ if (block_rsv->reserved < block_rsv->size)
+ block_rsv->full = 0;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+ return ret;
+}
+
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int update_size)
+{
+ spin_lock(&block_rsv->lock);
+ block_rsv->reserved += num_bytes;
+ if (update_size)
+ block_rsv->size += num_bytes;
+ else if (block_rsv->reserved >= block_rsv->size)
+ block_rsv->full = 1;
+ spin_unlock(&block_rsv->lock);
+}
+
+void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+
+ spin_lock(&block_rsv->lock);
+ if (num_bytes == (u64)-1)
+ num_bytes = block_rsv->size;
+ block_rsv->size -= num_bytes;
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ } else {
+ num_bytes = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (num_bytes > 0) {
+ if (dest) {
+ block_rsv_add_bytes(dest, num_bytes, 0);
+ } else {
+ spin_lock(&space_info->lock);
+ space_info->bytes_reserved -= num_bytes;
+ spin_unlock(&space_info->lock);
+ }
+ }
+}
+
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+ int ret;
+
+ ret = block_rsv_use_bytes(src, num_bytes);
+ if (ret)
+ return ret;
+
+ block_rsv_add_bytes(dst, num_bytes, 1);
+ return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+{
+ memset(rsv, 0, sizeof(*rsv));
+ spin_lock_init(&rsv->lock);
+ atomic_set(&rsv->usage, 1);
+ rsv->priority = 6;
+ INIT_LIST_HEAD(&rsv->list);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 alloc_target;
+
+ block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ if (!block_rsv)
+ return NULL;
+
+ btrfs_init_block_rsv(block_rsv);
+
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ block_rsv->space_info = __find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+
+ return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ if (rsv && atomic_dec_and_test(&rsv->usage)) {
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (!rsv->durable)
+ kfree(rsv);
+ }
+}
+
+/*
+ * make the block_rsv struct be able to capture freed space.
+ * the captured space will re-add to the the block_rsv struct
+ * after transaction commit
+ */
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv)
+{
+ block_rsv->durable = 1;
+ mutex_lock(&fs_info->durable_block_rsv_mutex);
+ list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
+ mutex_unlock(&fs_info->durable_block_rsv_mutex);
+}
+
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries)
+{
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+again:
+ ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 1);
+ return 0;
+ }
+
+ ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
+ if (ret > 0)
+ goto again;
+
+ return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int min_factor)
+{
+ u64 num_bytes = 0;
+ int commit_trans = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ if (min_factor > 0)
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (min_reserved > num_bytes)
+ num_bytes = min_reserved;
+
+ if (block_rsv->reserved >= num_bytes) {
+ ret = 0;
+ } else {
+ num_bytes -= block_rsv->reserved;
+ if (block_rsv->durable &&
+ block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+ commit_trans = 1;
+ }
+ spin_unlock(&block_rsv->lock);
+ if (!ret)
+ return 0;
+
+ if (block_rsv->refill_used) {
+ ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ return 0;
+ }
+ }
+
+ if (commit_trans) {
+ if (trans)
+ return -EAGAIN;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ ret = btrfs_commit_transaction(trans, root);
+ return 0;
+ }
+
+ WARN_ON(1);
+ printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+ block_rsv->size, block_rsv->reserved,
+ block_rsv->freed[0], block_rsv->freed[1]);
+
+ return -ENOSPC;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes)
+{
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ if (global_rsv->full || global_rsv == block_rsv ||
+ block_rsv->space_info != global_rsv->space_info)
+ global_rsv = NULL;
+ block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+}
+
+/*
+ * helper to calculate size of global block reservation.
+ * the desired value is sum of space used by extent tree,
+ * checksum tree and root tree
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *sinfo;
+ u64 num_bytes;
+ u64 meta_used;
+ u64 data_used;
+ int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+#if 0
+ /*
+ * per tree used space accounting can be inaccuracy, so we
+ * can't rely on it.
+ */
+ spin_lock(&fs_info->extent_root->accounting_lock);
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+ spin_unlock(&fs_info->extent_root->accounting_lock);
+
+ spin_lock(&fs_info->csum_root->accounting_lock);
+ num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+ spin_unlock(&fs_info->csum_root->accounting_lock);
+
+ spin_lock(&fs_info->tree_root->accounting_lock);
+ num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+ spin_unlock(&fs_info->tree_root->accounting_lock);
+#endif
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ spin_lock(&sinfo->lock);
+ data_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&sinfo->lock);
+ meta_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+ csum_size * 2;
+ num_bytes += div64_u64(data_used + meta_used, 50);
+
+ if (num_bytes * 3 > meta_used)
+ num_bytes = div64_u64(meta_used, 3);
+
+ return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+}
+
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ struct btrfs_space_info *sinfo = block_rsv->space_info;
+ u64 num_bytes;
+
+ num_bytes = calc_global_metadata_size(fs_info);
+
+ spin_lock(&block_rsv->lock);
+ spin_lock(&sinfo->lock);
+
+ block_rsv->size = num_bytes;
+
+ num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+ sinfo->bytes_reserved + sinfo->bytes_readonly;
+
+ if (sinfo->total_bytes > num_bytes) {
+ num_bytes = sinfo->total_bytes - num_bytes;
+ block_rsv->reserved += num_bytes;
+ sinfo->bytes_reserved += num_bytes;
+ }
+
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ sinfo->bytes_reserved -= num_bytes;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ }
+#if 0
+ printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
+ block_rsv->size, block_rsv->reserved);
+#endif
+ spin_unlock(&sinfo->lock);
+ spin_unlock(&block_rsv->lock);
+}
+
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ fs_info->chunk_block_rsv.space_info = space_info;
+ fs_info->chunk_block_rsv.priority = 10;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ fs_info->global_block_rsv.space_info = space_info;
+ fs_info->global_block_rsv.priority = 10;
+ fs_info->global_block_rsv.refill_used = 1;
+ fs_info->delalloc_block_rsv.space_info = space_info;
+ fs_info->trans_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.priority = 10;
+
+ fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+ btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
+
+ btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+
+ update_global_block_rsv(fs_info);
+}
+
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+ WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+ WARN_ON(fs_info->trans_block_rsv.size > 0);
+ WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+}
+
+static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ 3 * num_items;
+}
+
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int num_items, int *retries)
+{
+ u64 num_bytes;
+ int ret;
+
+ if (num_items == 0 || root->fs_info->chunk_root == root)
+ return 0;
+
+ num_bytes = calc_trans_metadata_size(root, num_items);
+ ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
+ num_bytes, retries);
+ if (!ret) {
+ trans->bytes_reserved += num_bytes;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ }
+ return ret;
+}
+
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!trans->bytes_reserved)
+ return;
+
+ BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+ btrfs_block_rsv_release(root, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->bytes_reserved = 0;
+}
+
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+
+ /*
+ * one for deleting orphan item, one for updating inode and
+ * two for calling btrfs_truncate_inode_items.
+ *
+ * btrfs_truncate_inode_items is a delete operation, it frees
+ * more space than it uses in most cases. So two units of
+ * metadata space should be enough for calling it many times.
+ * If all of the metadata space is used, we can commit
+ * transaction and use space it freed.
+ */
+ u64 num_bytes = calc_trans_metadata_size(root, 4);
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 num_bytes = calc_trans_metadata_size(root, 4);
+ btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+}
+
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
+ /*
+ * two for root back/forward refs, two for directory entries
+ * and one for root of the snapshot.
+ */
+ u64 num_bytes = calc_trans_metadata_size(root, 5);
+ dst_rsv->space_info = src_rsv->space_info;
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+{
+ return num_bytes >>= 3;
+}
+
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+ u64 to_reserve;
+ int nr_extents;
+ int retries = 0;
+ int ret;
+
+ if (btrfs_transaction_in_commit(root->fs_info))
+ schedule_timeout(1);
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+again:
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+ if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+ nr_extents -= BTRFS_I(inode)->reserved_extents;
+ to_reserve = calc_trans_metadata_size(root, nr_extents);
+ } else {
+ nr_extents = 0;
+ to_reserve = 0;
+ }
+
+ to_reserve += calc_csum_metadata_size(inode, num_bytes);
+ ret = reserve_metadata_bytes(block_rsv, to_reserve);
+ if (ret) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+ &retries);
+ if (ret > 0)
+ goto again;
+ return ret;
+ }
+
+ BTRFS_I(inode)->reserved_extents += nr_extents;
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ block_rsv_add_bytes(block_rsv, to_reserve, 1);
+
+ if (block_rsv->size > 512 * 1024 * 1024)
+ shrink_delalloc(NULL, root, to_reserve);
+
+ return 0;
+}
+
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 to_free;
+ int nr_extents;
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+ if (nr_extents < BTRFS_I(inode)->reserved_extents) {
+ nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
+ BTRFS_I(inode)->reserved_extents -= nr_extents;
+ } else {
+ nr_extents = 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ to_free = calc_csum_metadata_size(inode, num_bytes);
+ if (nr_extents > 0)
+ to_free += calc_trans_metadata_size(root, nr_extents);
+
+ btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+ to_free);
+}
+
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+ int ret;
+
+ ret = btrfs_check_data_free_space(inode, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+ if (ret) {
+ btrfs_free_reserved_data_space(inode, num_bytes);
+ return ret;
+ }
+
+ return 0;
+}
+
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+ btrfs_delalloc_release_metadata(inode, num_bytes);
+ btrfs_free_reserved_data_space(inode, num_bytes);
+}
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free)
+ u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *info = root->fs_info;
+ int factor;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
return -1;
+ if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
old_val += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
- cache->space_info->bytes_used += num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- if (cache->ro)
- cache->space_info->bytes_readonly -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
- cache->space_info->bytes_used -= num_bytes;
- if (cache->ro)
- cache->space_info->bytes_readonly += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- if (mark_free) {
- int ret;
- ret = btrfs_discard_extent(root, bytenr,
- num_bytes);
- WARN_ON(ret);
-
- ret = btrfs_add_free_space(cache, bytenr,
- num_bytes);
- WARN_ON(ret);
- }
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
}
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-/*
- * this function must be called within transaction
- */
-int btrfs_pin_extent(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int reserved)
+static int pin_down_extent(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_group_cache *cache;
-
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
-
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- btrfs_put_block_group(cache);
+ set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, reserved);
- set_extent_dirty(fs_info->pinned_extents,
- bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ btrfs_put_block_group(cache);
return 0;
}
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+/*
+ * update size of reserved extents. this function may return -EAGAIN
+ * if 'reserve' is true or 'sinfo' is false.
+ */
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo)
{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- cache->reserved += num_bytes;
- cache->space_info->bytes_reserved += num_bytes;
+ int ret = 0;
+ if (sinfo) {
+ struct btrfs_space_info *space_info = cache->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ }
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
} else {
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
+ spin_lock(&cache->lock);
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ if (reserve)
+ cache->reserved += num_bytes;
+ else
+ cache->reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
}
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- return 0;
+ return ret;
}
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
fs_info->pinned_extents = &fs_info->freed_extents[0];
up_write(&fs_info->extent_commit_sem);
+
+ update_global_block_rsv(fs_info);
return 0;
}
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
btrfs_add_free_space(cache, start, len);
}
+ start += len;
+
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
+ if (cache->ro) {
+ cache->space_info->bytes_readonly += len;
+ } else if (cache->reserved_pinned > 0) {
+ len = min(len, cache->reserved_pinned);
+ cache->reserved_pinned -= len;
+ cache->space_info->bytes_reserved += len;
+ }
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
-
- start += len;
}
if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *next_rsv;
u64 start;
u64 end;
+ int idx;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
- return ret;
-}
+ mutex_lock(&fs_info->durable_block_rsv_mutex);
+ list_for_each_entry_safe(block_rsv, next_rsv,
+ &fs_info->durable_block_rsv_list, list) {
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, u64 num_bytes,
- int is_data, int reserved,
- struct extent_buffer **must_clean)
-{
- int err = 0;
- struct extent_buffer *buf;
-
- if (is_data)
- goto pinit;
-
- /*
- * discard is sloooow, and so triggering discards on
- * individual btree blocks isn't a good plan. Just
- * pin everything in discard mode.
- */
- if (btrfs_test_opt(root, DISCARD))
- goto pinit;
-
- buf = btrfs_find_tree_block(root, bytenr, num_bytes);
- if (!buf)
- goto pinit;
+ idx = trans->transid & 0x1;
+ if (block_rsv->freed[idx] > 0) {
+ block_rsv_add_bytes(block_rsv,
+ block_rsv->freed[idx], 0);
+ block_rsv->freed[idx] = 0;
+ }
+ if (atomic_read(&block_rsv->usage) == 0) {
+ btrfs_block_rsv_release(root, block_rsv, (u64)-1);
- /* we can reuse a block if it hasn't been written
- * and it is from this transaction. We can't
- * reuse anything from the tree log root because
- * it has tiny sub-transactions.
- */
- if (btrfs_buffer_uptodate(buf, 0) &&
- btrfs_try_tree_lock(buf)) {
- u64 header_owner = btrfs_header_owner(buf);
- u64 header_transid = btrfs_header_generation(buf);
- if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
- header_transid == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- *must_clean = buf;
- return 1;
+ if (block_rsv->freed[0] == 0 &&
+ block_rsv->freed[1] == 0) {
+ list_del_init(&block_rsv->list);
+ kfree(block_rsv);
+ }
+ } else {
+ btrfs_block_rsv_release(root, block_rsv, 0);
}
- btrfs_tree_unlock(buf);
}
- free_extent_buffer(buf);
-pinit:
- if (path)
- btrfs_set_path_blocking(path);
- /* unlocks the pinned mutex */
- btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+ mutex_unlock(&fs_info->durable_block_rsv_mutex);
- BUG_ON(err < 0);
return 0;
}
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
} else {
- int mark_free = 0;
- struct extent_buffer *must_clean = NULL;
-
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = pin_down_bytes(trans, root, path, bytenr,
- num_bytes, is_data, 0, &must_clean);
- if (ret > 0)
- mark_free = 1;
- BUG_ON(ret < 0);
- /*
- * it is going to be very rare for someone to be waiting
- * on the block we're freeing. del_items might need to
- * schedule, so rather than get fancy, just force it
- * to blocking here
- */
- if (must_clean)
- btrfs_set_lock_blocking(must_clean);
-
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
- if (must_clean) {
- clean_tree_block(NULL, root, must_clean);
- btrfs_tree_unlock(must_clean);
- free_extent_buffer(must_clean);
- }
-
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0,
- mark_free);
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
BUG_ON(ret);
}
btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
/*
- * when we free an extent, it is possible (and likely) that we free the last
+ * when we free an block, it is possible (and likely) that we free the last
* delayed ref for that extent as well. This searches the delayed ref tree for
* a given extent, and if there are no other delayed refs to be processed, it
* removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
- int ret;
+ int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
list_del_init(&head->cluster);
spin_unlock(&delayed_refs->lock);
- ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
- &head->node, head->extent_op,
- head->must_insert_reserved);
- BUG_ON(ret);
+ BUG_ON(head->extent_op);
+ if (head->must_insert_reserved)
+ ret = 1;
+
+ mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
- return 0;
+ return ret;
out:
spin_unlock(&delayed_refs->lock);
return 0;
}
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_group_cache *cache = NULL;
+ int ret;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL);
+ BUG_ON(ret);
+ }
+
+ if (!last_ref)
+ return;
+
+ block_rsv = get_block_rsv(trans, root);
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+ BUG_ON(block_rsv->space_info != cache->space_info);
+
+ if (btrfs_header_generation(buf) == trans->transid) {
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, root, buf->start);
+ if (!ret)
+ goto pin;
+ }
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(root, cache, buf->start, buf->len, 1);
+ goto pin;
+ }
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(cache, buf->start, buf->len);
+ ret = update_reserved_bytes(cache, buf->len, 0, 0);
+ if (ret == -EAGAIN) {
+ /* block group became read-only */
+ update_reserved_bytes(cache, buf->len, 0, 1);
+ goto out;
+ }
+
+ ret = 1;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ block_rsv->reserved += buf->len;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (ret) {
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_reserved -= buf->len;
+ spin_unlock(&cache->space_info->lock);
+ }
+ goto out;
+ }
+pin:
+ if (block_rsv->durable && !cache->ro) {
+ ret = 0;
+ spin_lock(&cache->lock);
+ if (!cache->ro) {
+ cache->reserved_pinned += buf->len;
+ ret = 1;
+ }
+ spin_unlock(&cache->lock);
+
+ if (ret) {
+ spin_lock(&block_rsv->lock);
+ block_rsv->freed[trans->transid & 0x1] += buf->len;
+ spin_unlock(&block_rsv->lock);
+ }
+ }
+out:
+ btrfs_put_block_group(cache);
+}
+
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, (int)owner,
BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret);
- ret = check_ref_cleanup(trans, root, bytenr);
- BUG_ON(ret);
} else {
ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 parent, u64 root_objectid, int level)
-{
- u64 used;
- spin_lock(&root->node_lock);
- used = btrfs_root_used(&root->root_item) - blocksize;
- btrfs_set_root_used(&root->root_item, used);
- spin_unlock(&root->node_lock);
-
- return btrfs_free_extent(trans, root, bytenr, blocksize,
- parent, root_objectid, level, 0);
-}
-
static u64 stripe_align(struct btrfs_root *root, u64 val)
{
u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ int index;
+ if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+ index = 0;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ index = 1;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ index = 2;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ index = 3;
+ else
+ index = 4;
+ return index;
+}
+
enum btrfs_loop_type {
LOOP_FIND_IDEAL = 0,
LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 empty_size,
u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
- u64 exclude_start, u64 exclude_nr,
int data)
{
int ret = 0;
@@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
+ int index = 0;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
@@ -4237,6 +4620,7 @@ ideal_cache:
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
+ index = get_block_group_index(block_group);
goto have_block_group;
}
} else if (block_group) {
@@ -4245,7 +4629,8 @@ ideal_cache:
}
search:
down_read(&space_info->groups_sem);
- list_for_each_entry(block_group, &space_info->block_groups, list) {
+ list_for_each_entry(block_group, &space_info->block_groups[index],
+ list) {
u64 offset;
int cached;
@@ -4436,23 +4821,22 @@ checks:
goto loop;
}
- if (exclude_nr > 0 &&
- (search_start + num_bytes > exclude_start &&
- search_start < exclude_start + exclude_nr)) {
- search_start = exclude_start + exclude_nr;
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ if (offset < search_start)
+ btrfs_add_free_space(block_group, offset,
+ search_start - offset);
+ BUG_ON(offset > search_start);
+ ret = update_reserved_bytes(block_group, num_bytes, 1,
+ (data & BTRFS_BLOCK_GROUP_DATA));
+ if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
- /*
- * if search_start is still in this block group
- * then we just re-search this block group
- */
- if (search_start >= block_group->key.objectid &&
- search_start < (block_group->key.objectid +
- block_group->key.offset))
- goto have_block_group;
goto loop;
}
+ /* we are all good, lets return */
ins->objectid = search_start;
ins->offset = num_bytes;
@@ -4460,18 +4844,18 @@ checks:
btrfs_add_free_space(block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
-
- update_reserved_extents(block_group, num_bytes, 1);
-
- /* we are all good, lets return */
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
+ BUG_ON(index != get_block_group_index(block_group));
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
+ if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ goto search;
+
/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
* for them to make caching progress. Also
* determine the best possible bg to cache
@@ -4485,6 +4869,7 @@ loop:
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
(found_uncached_bg || empty_size || empty_cluster ||
allowed_chunk_alloc)) {
+ index = 0;
if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
found_uncached_bg = false;
loop++;
@@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ int index = 0;
spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved -
- info->bytes_super),
+ info->bytes_readonly),
(info->full) ? "" : "not ");
- printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
- "\n",
+ printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+ "reserved=%llu, may_use=%llu, readonly=%llu\n",
(unsigned long long)info->total_bytes,
+ (unsigned long long)info->bytes_used,
(unsigned long long)info->bytes_pinned,
- (unsigned long long)info->bytes_delalloc,
+ (unsigned long long)info->bytes_reserved,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used,
- (unsigned long long)info->bytes_root,
- (unsigned long long)info->bytes_super,
- (unsigned long long)info->bytes_reserved);
+ (unsigned long long)info->bytes_readonly);
spin_unlock(&info->lock);
if (!dump_block_groups)
return;
down_read(&info->groups_sem);
- list_for_each_entry(cache, &info->block_groups, list) {
+again:
+ list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
"%llu pinned %llu reserved\n",
@@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
+ if (++index < BTRFS_NR_RAID_TYPES)
+ goto again;
up_read(&info->groups_sem);
}
@@ -4628,9 +5014,8 @@ again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte, ins,
- trans->alloc_exclude_start,
- trans->alloc_exclude_nr, data);
+ search_start, search_end, hint_byte,
+ ins, data);
if (ret == -ENOSPC && num_bytes > min_alloc_size) {
num_bytes = num_bytes >> 1;
@@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
- update_reserved_extents(cache, len, 0);
+ update_reserved_bytes(cache, len, 0, 1);
btrfs_put_block_group(cache);
return ret;
@@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 1, 0);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) {
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 1, 0);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) {
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
put_caching_control(caching_ctl);
}
- update_reserved_extents(block_group, ins->offset, 1);
+ ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+ BUG_ON(ret);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
return ret;
}
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-static int alloc_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 parent, u64 root_objectid,
- struct btrfs_disk_key *key, int level,
- u64 empty_size, u64 hint_byte, u64 search_end,
- struct btrfs_key *ins)
-{
- int ret;
- u64 flags = 0;
-
- ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- empty_size, hint_byte, search_end,
- ins, 0);
- if (ret)
- return ret;
-
- if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (parent == 0)
- parent = ins->objectid;
- flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
- } else
- BUG_ON(parent > 0);
-
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
-
- ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
- ins->offset, parent, root_objectid,
- level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
- BUG_ON(ret);
- }
-
- if (root_objectid == root->root_key.objectid) {
- u64 used;
- spin_lock(&root->node_lock);
- used = btrfs_root_used(&root->root_item) + num_bytes;
- btrfs_set_root_used(&root->root_item, used);
- spin_unlock(&root->node_lock);
- }
- return ret;
-}
-
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
return buf;
}
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u32 blocksize)
+{
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ block_rsv = get_block_rsv(trans, root);
+
+ if (block_rsv->size == 0) {
+ ret = reserve_metadata_bytes(block_rsv, blocksize);
+ if (ret)
+ return ERR_PTR(ret);
+ return block_rsv;
+ }
+
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
+ return block_rsv;
+
+ WARN_ON(1);
+ printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+ block_rsv->size, block_rsv->reserved,
+ block_rsv->freed[0], block_rsv->freed[1]);
+
+ return ERR_PTR(-ENOSPC);
+}
+
+static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+ block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_release_bytes(block_rsv, NULL, 0);
+}
+
/*
- * helper function to allocate a block for a given tree
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
* returns the tree buffer or NULL.
*/
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 hint, u64 empty_size)
{
struct btrfs_key ins;
- int ret;
+ struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
+ u64 flags = 0;
+ int ret;
+
- ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
- key, level, empty_size, hint, (u64)-1, &ins);
+ block_rsv = use_block_rsv(trans, root, blocksize);
+ if (IS_ERR(block_rsv))
+ return ERR_CAST(block_rsv);
+
+ ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+ empty_size, hint, (u64)-1, &ins, 0);
if (ret) {
- BUG_ON(ret > 0);
+ unuse_block_rsv(block_rsv, blocksize);
return ERR_PTR(ret);
}
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
+ BUG_ON(IS_ERR(buf));
+
+ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent == 0)
+ parent = ins.objectid;
+ flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else
+ BUG_ON(parent > 0);
+
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = 1;
+ extent_op->update_flags = 1;
+ extent_op->is_data = 0;
+
+ ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ins.offset, parent, root_objectid,
+ level, BTRFS_ADD_DELAYED_EXTENT,
+ extent_op);
+ BUG_ON(ret);
+ }
return buf;
}
@@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
- int ret = 0;
+ int ret;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
@@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
- root->root_key.objectid, level, 0);
- BUG_ON(ret);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return ret;
+ return 0;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
wc = kzalloc(sizeof(*wc), GFP_NOFS);
BUG_ON(!wc);
- trans = btrfs_start_transaction(tree_root, 1);
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
@@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
}
BUG_ON(wc->level == 0);
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing) {
+ if (btrfs_should_end_transaction(trans, tree_root)) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
BUG_ON(ret);
- btrfs_end_transaction(trans, tree_root);
- trans = btrfs_start_transaction(tree_root, 1);
- } else {
- unsigned long update;
- update = trans->delayed_ref_updates;
- trans->delayed_ref_updates = 0;
- if (update)
- btrfs_run_delayed_refs(trans, tree_root,
- update);
+ btrfs_end_transaction_throttle(trans, tree_root);
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
}
}
btrfs_release_path(root, path);
@@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
kfree(root);
}
out:
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction_throttle(trans, tree_root);
kfree(wc);
btrfs_free_path(path);
return err;
@@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int __alloc_chunk_for_shrink(struct btrfs_root *root,
- struct btrfs_block_group_cache *shrink_block_group,
- int force)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache)
{
- struct btrfs_trans_handle *trans;
- u64 new_alloc_flags;
- u64 calc;
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+ int ret = -ENOSPC;
- spin_lock(&shrink_block_group->lock);
- if (btrfs_block_group_used(&shrink_block_group->item) +
- shrink_block_group->reserved > 0) {
- spin_unlock(&shrink_block_group->lock);
+ if (cache->ro)
+ return 0;
- trans = btrfs_start_transaction(root, 1);
- spin_lock(&shrink_block_group->lock);
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+ sinfo->bytes_may_use + sinfo->bytes_readonly +
+ cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ sinfo->bytes_readonly += num_bytes;
+ sinfo->bytes_reserved += cache->reserved_pinned;
+ cache->reserved_pinned = 0;
+ cache->ro = 1;
+ ret = 0;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+ return ret;
+}
- new_alloc_flags = update_block_group_flags(root,
- shrink_block_group->flags);
- if (new_alloc_flags != shrink_block_group->flags) {
- calc =
- btrfs_block_group_used(&shrink_block_group->item);
- } else {
- calc = shrink_block_group->key.offset;
- }
- spin_unlock(&shrink_block_group->lock);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
- do_chunk_alloc(trans, root->fs_info->extent_root,
- calc + 2 * 1024 * 1024, new_alloc_flags, force);
+{
+ struct btrfs_trans_handle *trans;
+ u64 alloc_flags;
+ int ret;
- btrfs_end_transaction(trans, root);
- } else
- spin_unlock(&shrink_block_group->lock);
- return 0;
-}
+ BUG_ON(cache->ro);
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ if (alloc_flags != cache->flags)
+ do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- struct btrfs_block_group_cache *group)
+ ret = set_block_group_ro(cache);
+ if (!ret)
+ goto out;
+ alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ if (ret < 0)
+ goto out;
+ ret = set_block_group_ro(cache);
+out:
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
{
- __alloc_chunk_for_shrink(root, group, 1);
- set_block_group_readonly(group);
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+
+ BUG_ON(!cache->ro);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ cache->ro = 0;
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
return 0;
}
@@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
*/
synchronize_rcu();
+ release_global_block_rsv(info);
+
while(!list_empty(&info->space_info)) {
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
-
+ if (space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0) {
+ WARN_ON(1);
+ dump_space_info(space_info, 0, 0);
+ }
list_del(&space_info->list);
kfree(space_info);
}
return 0;
}
+static void __link_block_group(struct btrfs_space_info *space_info,
+ struct btrfs_block_group_cache *cache)
+{
+ int index = get_block_group_index(cache);
+
+ down_write(&space_info->groups_sem);
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+}
+
int btrfs_read_block_groups(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
while (1) {
ret = find_first_block_group(root, path, &key);
- if (ret > 0) {
- ret = 0;
- goto error;
- }
+ if (ret > 0)
+ break;
if (ret != 0)
goto error;
@@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache) {
ret = -ENOMEM;
- break;
+ goto error;
}
atomic_set(&cache->count, 1);
@@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
BUG_ON(ret);
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_super += cache->bytes_super;
+ cache->space_info->bytes_readonly += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups);
- up_write(&space_info->groups_sem);
+ __link_block_group(space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret);
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
- set_block_group_readonly(cache);
+ set_block_group_ro(cache);
}
+
+ list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+ if (!(get_alloc_profile(root, space_info->flags) &
+ (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)))
+ continue;
+ /*
+ * avoid allocating from un-mirrored block group if there are
+ * mirrored block groups.
+ */
+ list_for_each_entry(cache, &space_info->block_groups[3], list)
+ set_block_group_ro(cache);
+ list_for_each_entry(cache, &space_info->block_groups[4], list)
+ set_block_group_ro(cache);
+ }
+
+ init_global_block_rsv(info);
ret = 0;
error:
btrfs_free_path(path);
@@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
BUG_ON(ret);
spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_super += cache->bytes_super;
+ cache->space_info->bytes_readonly += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
- down_write(&cache->space_info->groups_sem);
- list_add_tail(&cache->list, &cache->space_info->block_groups);
- up_write(&cache->space_info->groups_sem);
+ __link_block_group(cache->space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab..a4080c21ec5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
}
-static void free_extent_state(struct extent_state *state)
+void free_extent_state(struct extent_state *state)
{
if (!state)
return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
}
static int set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned long bits)
+ struct extent_state *state, int *bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
return tree->ops->set_bit_hook(tree->mapping->host,
- state->start, state->end,
- state->state, bits);
+ state, bits);
}
return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned long bits)
+ struct extent_state *state, int *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
- int bits)
+ int *bits)
{
struct rb_node *node;
+ int bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
if (ret)
return ret;
- if (bits & EXTENT_DIRTY)
+ if (bits_to_set & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
- state->state |= bits;
+ state->state |= bits_to_set;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
* struct is freed and removed from the tree
*/
static int clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state, int bits, int wake,
- int delete)
+ struct extent_state *state,
+ int *bits, int wake)
{
- int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret = state->state & bits_to_clear;
- if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
WARN_ON(range > tree->dirty_bytes);
tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
- if (delete || state->state == 0) {
+ if (state->state == 0) {
if (state->tree) {
- clear_state_cb(tree, state, state->state);
rb_erase(&state->rb_node, &tree->state);
state->tree = NULL;
free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int set = 0;
int clear = 0;
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+ bits |= EXTENT_FIRST_DELALLOC;
+
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
@@ -580,8 +581,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits, wake,
- delete);
+ set |= clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+ set |= clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
@@ -613,7 +613,7 @@ hit_next:
else
next_node = NULL;
- set |= clear_state_bit(tree, state, bits, wake, delete);
+ set |= clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -706,19 +706,19 @@ out:
static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- int bits)
+ int *bits)
{
int ret;
+ int bits_to_set = *bits & ~EXTENT_CTLBITS;
ret = set_state_cb(tree, state, bits);
if (ret)
return ret;
-
- if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- state->state |= bits;
+ state->state |= bits_to_set;
return 0;
}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
* [start, end] is inclusive This takes the tree lock.
*/
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state,
- gfp_t mask)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
*/
node = tree_search(tree, start);
if (!node) {
- err = insert_state(tree, prealloc, start, end, bits);
+ err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
BUG_ON(err == -EEXIST);
goto out;
@@ -802,7 +802,7 @@ hit_next:
goto out;
}
- err = set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, &bits);
if (err)
goto out;
@@ -852,7 +852,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- err = set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, &bits);
if (err)
goto out;
cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
else
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
- bits);
+ &bits);
BUG_ON(err == -EEXIST);
if (err) {
prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- err = set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, &bits);
if (err) {
prealloc = NULL;
goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0,
- NULL, mask);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
- if (op & EXTENT_CLEAR_ACCOUNTING)
- clear_bits |= EXTENT_DO_ACCOUNTING;
-
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
if (tree->ops && tree->ops->submit_bio_hook)
tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
- mirror_num, bio_flags);
+ mirror_num, bio_flags, start);
else
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
+ struct btrfs_ordered_extent *ordered;
int ret;
int nr = 0;
size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
set_page_extent_mapped(page);
end = page_end;
- lock_extent(tree, start, end, GFP_NOFS);
+ while (1) {
+ lock_extent(tree, start, end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
char *userpage;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646..5691c7b590d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/* flags for bio submission */
#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags);
+ unsigned long bio_flags, u64 bio_offset);
struct extent_io_ops {
int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
struct extent_state *state);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ int *bits);
int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long bits);
+ int *bits);
int (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
u64 max_bytes, unsigned long bits);
+void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa..a562a250ae7 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
}
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+ struct inode *inode, struct bio *bio,
+ u64 logical_offset, u32 *dst, int dio)
{
u32 sum;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
- u64 offset;
+ u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
WARN_ON(bio->bi_vcnt <= 0);
disk_bytenr = (u64)bio->bi_sector << 9;
+ if (dio)
+ offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ if (!dio)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
if (ret == 0)
goto found;
@@ -238,6 +242,7 @@ found:
else
set_state_private(io_tree, offset, sum);
disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
bio_index++;
bvec++;
}
@@ -245,6 +250,18 @@ found:
return 0;
}
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 offset, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list)
{
@@ -657,6 +674,9 @@ again:
goto found;
}
ret = PTR_ERR(item);
+ if (ret != -EFBIG && ret != -ENOENT)
+ goto fail_unlock;
+
if (ret == -EFBIG) {
u32 item_size;
/* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4c..79437c5eeb1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
int write_bytes,
struct page **prepared_pages,
- const char __user *buf)
+ struct iov_iter *i)
{
- long page_fault = 0;
- int i;
+ size_t copied;
+ int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
- for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+ while (write_bytes > 0) {
size_t count = min_t(size_t,
PAGE_CACHE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[i];
- fault_in_pages_readable(buf, count);
+ struct page *page = prepared_pages[pg];
+again:
+ if (unlikely(iov_iter_fault_in_readable(i, count)))
+ return -EFAULT;
/* Copy data from userspace to the current page */
- kmap(page);
- page_fault = __copy_from_user(page_address(page) + offset,
- buf, count);
+ copied = iov_iter_copy_from_user(page, i, offset, count);
+
/* Flush processor's dcache for this page */
flush_dcache_page(page);
- kunmap(page);
- buf += count;
- write_bytes -= count;
+ iov_iter_advance(i, copied);
+ write_bytes -= copied;
- if (page_fault)
- break;
+ if (unlikely(copied == 0)) {
+ count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+
+ if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+ offset += copied;
+ } else {
+ pg++;
+ offset = 0;
+ }
}
- return page_fault ? -EFAULT : 0;
+ return 0;
}
/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
NULL);
- if (err)
- return err;
+ BUG_ON(err);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* at this time.
*/
}
- return err;
+ return 0;
}
/*
@@ -823,45 +832,46 @@ again:
return 0;
}
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- loff_t pos;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page *pinned[2];
+ struct page **pages = NULL;
+ struct iov_iter i;
+ loff_t *ppos = &iocb->ki_pos;
loff_t start_pos;
ssize_t num_written = 0;
ssize_t err = 0;
+ size_t count;
+ size_t ocount;
int ret = 0;
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page **pages = NULL;
int nrptrs;
- struct page *pinned[2];
unsigned long first_index;
unsigned long last_index;
int will_write;
+ int buffered = 0;
will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
(file->f_flags & O_DIRECT));
- nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
- PAGE_CACHE_SIZE / (sizeof(struct page *)));
pinned[0] = NULL;
pinned[1] = NULL;
- pos = *ppos;
start_pos = pos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
- /* do the reserve before the mutex lock in case we have to do some
- * flushing. We wouldn't deadlock, but this is more polite.
- */
- err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (err)
- goto out_nolock;
-
mutex_lock(&inode->i_mutex);
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ goto out;
+ count = ocount;
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
goto out;
file_update_time(file);
+ BTRFS_I(inode)->sequence++;
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+ pos, ppos, count,
+ ocount);
+ /*
+ * the generic O_DIRECT will update in-memory i_size after the
+ * DIOs are done. But our endio handlers that update the on
+ * disk i_size never update past the in memory i_size. So we
+ * need one more update here to catch any additions to the
+ * file
+ */
+ if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
+ }
+ if (num_written < 0) {
+ ret = num_written;
+ num_written = 0;
+ goto out;
+ } else if (num_written == count) {
+ /* pick up pos changes done by the generic code */
+ pos = *ppos;
+ goto out;
+ }
+ /*
+ * We are going to do buffered for the rest of the range, so we
+ * need to make sure to invalidate the buffered pages when we're
+ * done.
+ */
+ buffered = 1;
+ pos += num_written;
+ }
+
+ iov_iter_init(&i, iov, nr_segs, count, num_written);
+ nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+ (sizeof(struct page *)));
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
/* generic_write_checks can change our pos */
start_pos = pos;
- BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+ last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
/*
* there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
unlock_page(pinned[0]);
}
}
- if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+ if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
pinned[1] = grab_cache_page(inode->i_mapping, last_index);
if (!PageUptodate(pinned[1])) {
ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
}
- while (count > 0) {
+ while (iov_iter_count(&i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(count, nrptrs *
- (size_t)PAGE_CACHE_SIZE -
+ size_t write_bytes = min(iov_iter_count(&i),
+ nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
WARN_ON(num_pages > nrptrs);
memset(pages, 0, sizeof(struct page *) * nrptrs);
- ret = btrfs_check_data_free_space(root, inode, write_bytes);
+ ret = btrfs_delalloc_reserve_space(inode, write_bytes);
if (ret)
goto out;
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
pos, first_index, last_index,
write_bytes);
if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
}
ret = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, buf);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
- btrfs_drop_pages(pages, num_pages);
- goto out;
+ write_bytes, pages, &i);
+ if (ret == 0) {
+ dirty_and_release_pages(NULL, root, file, pages,
+ num_pages, pos, write_bytes);
}
- ret = dirty_and_release_pages(NULL, root, file, pages,
- num_pages, pos, write_bytes);
btrfs_drop_pages(pages, num_pages);
if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
}
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
btrfs_throttle(root);
}
- buf += write_bytes;
- count -= write_bytes;
pos += write_bytes;
num_written += write_bytes;
@@ -976,9 +1016,7 @@ out:
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-out_nolock:
kfree(pages);
if (pinned[0])
page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
num_written = err;
if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
btrfs_end_transaction(trans, root);
}
}
- if (file->f_flags & O_DIRECT) {
+ if (file->f_flags & O_DIRECT && buffered) {
invalidate_mapping_pages(inode->i_mapping,
start_pos >> PAGE_CACHE_SHIFT,
(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1104,9 +1142,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (file && file->private_data)
btrfs_ioctl_trans_end(file);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto out;
}
@@ -1161,7 +1199,7 @@ const struct file_operations btrfs_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.splice_read = generic_file_splice_read,
- .write = btrfs_file_write,
+ .aio_write = btrfs_file_aio_write,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6..64f1150bb48 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
return 0;
}
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod)
+{
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ if (!find_name_in_backref(path, name, name_len, &ref))
+ return NULL;
+ return ref;
+}
+
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d..fa6ccc1bfe2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
+ btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
@@ -414,6 +415,7 @@ again:
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* lets try to make an inline extent */
if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
start, end, NULL,
EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_ACCOUNTING |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
return 0;
}
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+ u64 num_bytes)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ u64 alloc_hint = 0;
+
+ read_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&em_tree->lock);
+
+ return alloc_hint;
+}
+
/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
actual_end = min_t(u64, isize, end + 1);
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_ACCOUNTING |
EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK |
EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
-
- read_lock(&BTRFS_I(inode)->extent_tree.lock);
- em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
- start, num_bytes);
- if (em) {
- /*
- * if block start isn't an actual block number then find the
- * first block in this inode and use that as a hint. If that
- * block is also bogus then just don't worry about it.
- */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
- if (em)
- free_extent_map(em);
- } else {
- alloc_hint = em->block_start;
- free_extent_map(em);
- }
- }
- read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+ alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
num_bytes, num_bytes, type);
BUG_ON(ret);
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, cur_offset,
+ num_bytes);
+ BUG_ON(ret);
+ }
+
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
cur_offset, cur_offset + num_bytes - 1,
locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
}
static int btrfs_split_extent_hook(struct inode *inode,
- struct extent_state *orig, u64 split)
+ struct extent_state *orig, u64 split)
{
+ /* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return 0;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
return 0;
}
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
if (!(other->state & EXTENT_DELALLOC))
return 0;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
return 0;
}
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+ struct extent_state *state, int *bits)
{
/*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+ if (*bits & EXTENT_FIRST_DELALLOC)
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ else
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
spin_lock(&root->fs_info->delalloc_lock);
- BTRFS_I(inode)->delalloc_bytes += end - start + 1;
- root->fs_info->delalloc_bytes += end - start + 1;
+ BTRFS_I(inode)->delalloc_bytes += len;
+ root->fs_info->delalloc_bytes += len;
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
&root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
static int btrfs_clear_bit_hook(struct inode *inode,
- struct extent_state *state, unsigned long bits)
+ struct extent_state *state, int *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
- if (bits & EXTENT_DO_ACCOUNTING) {
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
- }
+ if (*bits & EXTENT_FIRST_DELALLOC)
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ else if (!(*bits & EXTENT_DO_ACCOUNTING))
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+ if (*bits & EXTENT_DO_ACCOUNTING)
+ btrfs_delalloc_release_metadata(inode, len);
+
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ btrfs_free_reserved_data_space(inode, len);
spin_lock(&root->fs_info->delalloc_lock);
- if (state->end - state->start + 1 >
- root->fs_info->delalloc_bytes) {
- printk(KERN_INFO "btrfs warning: delalloc account "
- "%llu %llu\n",
- (unsigned long long)
- state->end - state->start + 1,
- (unsigned long long)
- root->fs_info->delalloc_bytes);
- btrfs_delalloc_free_space(root, inode, (u64)-1);
- root->fs_info->delalloc_bytes = 0;
- BTRFS_I(inode)->delalloc_bytes = 0;
- } else {
- btrfs_delalloc_free_space(root, inode,
- state->end -
- state->start + 1);
- root->fs_info->delalloc_bytes -= state->end -
- state->start + 1;
- BTRFS_I(inode)->delalloc_bytes -= state->end -
- state->start + 1;
- }
+ root->fs_info->delalloc_bytes -= len;
+ BTRFS_I(inode)->delalloc_bytes -= len;
+
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
*/
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
* are inserted into the btree
*/
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* on write, or reading the csums from the tree before a read
*/
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
/* we're doing a write, do the async checksumming */
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
- bio_flags, __btrfs_submit_bio_start,
+ bio_flags, bio_offset,
+ __btrfs_submit_bio_start,
__btrfs_submit_bio_done);
}
@@ -1520,6 +1525,7 @@ again:
goto again;
}
+ BUG();
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
ClearPageChecked(page);
out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
}
goto out;
}
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
0, &cached_state, GFP_NOFS);
trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- /* this also removes the ordered extent from the tree */
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
out:
+ btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+ if (trans)
+ btrfs_end_transaction(trans, root);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
failrec->last_mirror,
- failrec->bio_flags);
+ failrec->bio_flags, 0);
return 0;
}
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
/*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct btrfs_block_rsv *block_rsv;
+ u64 num_bytes;
+ int index;
+
+ root = pending->root;
+ if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ return;
+
+ block_rsv = root->orphan_block_rsv;
+
+ /* orphan block reservation for the snapshot */
+ num_bytes = block_rsv->size;
+
+ /*
+ * after the snapshot is created, COWing tree blocks may use more
+ * space than it frees. So we should make sure there is enough
+ * reserved space.
+ */
+ index = trans->transid & 0x1;
+ if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ num_bytes += block_rsv->size -
+ (block_rsv->reserved + block_rsv->freed[index]);
+ }
+
+ *bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *snap = pending->snap;
+ struct btrfs_block_rsv *block_rsv;
+ u64 num_bytes;
+ int index;
+ int ret;
+
+ if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ return;
+
+ /* refill source subvolume's orphan block reservation */
+ block_rsv = root->orphan_block_rsv;
+ index = trans->transid & 0x1;
+ if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ num_bytes = block_rsv->size -
+ (block_rsv->reserved + block_rsv->freed[index]);
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ root->orphan_block_rsv,
+ num_bytes);
+ BUG_ON(ret);
+ }
+
+ /* setup orphan block reservation for the snapshot */
+ block_rsv = btrfs_alloc_block_rsv(snap);
+ BUG_ON(!block_rsv);
+
+ btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+ snap->orphan_block_rsv = block_rsv;
+
+ num_bytes = root->orphan_block_rsv->size;
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ block_rsv, num_bytes);
+ BUG_ON(ret);
+
+#if 0
+ /* insert orphan item for the snapshot */
+ WARN_ON(!root->orphan_item_inserted);
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ snap->root_key.objectid);
+ BUG_ON(ret);
+ snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ if (!list_empty(&root->orphan_list) ||
+ root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+ return;
+
+ if (root->orphan_item_inserted &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ root->orphan_item_inserted = 0;
+ }
+
+ if (root->orphan_block_rsv) {
+ WARN_ON(root->orphan_block_rsv->size > 0);
+ btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
+ }
+}
+
+/*
* This creates an orphan entry for the given inode in case something goes
* wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ * this function.
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
+ struct btrfs_block_rsv *block_rsv = NULL;
+ int reserve = 0;
+ int insert = 0;
+ int ret;
- spin_lock(&root->list_lock);
+ if (!root->orphan_block_rsv) {
+ block_rsv = btrfs_alloc_block_rsv(root);
+ BUG_ON(!block_rsv);
+ }
- /* already on the orphan list, we're good */
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- spin_unlock(&root->list_lock);
- return 0;
+ spin_lock(&root->orphan_lock);
+ if (!root->orphan_block_rsv) {
+ root->orphan_block_rsv = block_rsv;
+ } else if (block_rsv) {
+ btrfs_free_block_rsv(root, block_rsv);
+ block_rsv = NULL;
+ }
+
+ if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+ /*
+ * For proper ENOSPC handling, we should do orphan
+ * cleanup when mounting. But this introduces backward
+ * compatibility issue.
+ */
+ if (!xchg(&root->orphan_item_inserted, 1))
+ insert = 2;
+ else
+ insert = 1;
+#endif
+ insert = 1;
+ } else {
+ WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
}
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ if (!BTRFS_I(inode)->orphan_meta_reserved) {
+ BTRFS_I(inode)->orphan_meta_reserved = 1;
+ reserve = 1;
+ }
+ spin_unlock(&root->orphan_lock);
- spin_unlock(&root->list_lock);
+ if (block_rsv)
+ btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
- /*
- * insert an orphan item to track this unlinked/truncated file
- */
- ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ /* grab metadata reservation from transaction handle */
+ if (reserve) {
+ ret = btrfs_orphan_reserve_metadata(trans, inode);
+ BUG_ON(ret);
+ }
- return ret;
+ /* insert an orphan item to track this unlinked/truncated file */
+ if (insert >= 1) {
+ ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
+
+ /* insert an orphan item to track subvolume contains orphan files */
+ if (insert >= 2) {
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ }
+ return 0;
}
/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ int delete_item = 0;
+ int release_rsv = 0;
int ret = 0;
- spin_lock(&root->list_lock);
-
- if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- spin_unlock(&root->list_lock);
- return 0;
+ spin_lock(&root->orphan_lock);
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ list_del_init(&BTRFS_I(inode)->i_orphan);
+ delete_item = 1;
}
- list_del_init(&BTRFS_I(inode)->i_orphan);
- if (!trans) {
- spin_unlock(&root->list_lock);
- return 0;
+ if (BTRFS_I(inode)->orphan_meta_reserved) {
+ BTRFS_I(inode)->orphan_meta_reserved = 0;
+ release_rsv = 1;
}
+ spin_unlock(&root->orphan_lock);
- spin_unlock(&root->list_lock);
+ if (trans && delete_item) {
+ ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
- ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ if (release_rsv)
+ btrfs_orphan_release_metadata(inode);
- return ret;
+ return 0;
}
/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
struct inode *inode;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
- if (!xchg(&root->clean_orphans, 0))
+ if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
return;
path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- if (IS_ERR(inode))
- break;
+ BUG_ON(IS_ERR(inode));
/*
* add this inode to the orphan list so btrfs_orphan_del does
* the proper thing when we hit it
*/
- spin_lock(&root->list_lock);
+ spin_lock(&root->orphan_lock);
list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->list_lock);
+ spin_unlock(&root->orphan_lock);
/*
* if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
* do a destroy_inode
*/
if (is_bad_inode(inode)) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
btrfs_orphan_del(trans, inode);
btrfs_end_transaction(trans, root);
iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
}
+ btrfs_free_path(path);
+
+ root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+ if (root->orphan_block_rsv)
+ btrfs_block_rsv_release(root, root->orphan_block_rsv,
+ (u64)-1);
+
+ if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_end_transaction(trans, root);
+ }
if (nr_unlink)
printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
if (nr_truncate)
printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
- btrfs_free_path(path);
}
/*
@@ -2478,29 +2666,201 @@ out:
return ret;
}
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+/* helper to check if there is any shared block in the path */
+static int check_path_shared(struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct extent_buffer *eb;
+ int level;
+ int ret;
+ u64 refs;
+
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level])
+ break;
+ eb = path->nodes[level];
+ if (!btrfs_block_can_be_shared(root, eb))
+ continue;
+ ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+ &refs, NULL);
+ if (refs > 1)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space.
+ * so in enospc case, we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+ struct dentry *dentry)
{
- struct btrfs_root *root;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_dir_item *di;
struct inode *inode = dentry->d_inode;
+ u64 index;
+ int check_link = 1;
+ int err = -ENOSPC;
int ret;
- unsigned long nr = 0;
- root = BTRFS_I(dir)->root;
+ trans = btrfs_start_transaction(root, 10);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
- /*
- * 5 items for unlink inode
- * 1 for orphan
- */
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- return ret;
+ if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return ERR_PTR(-ENOSPC);
+
+ /* check if there is someone else holds reference */
+ if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+ return ERR_PTR(-ENOSPC);
+
+ if (atomic_read(&inode->i_count) > 2)
+ return ERR_PTR(-ENOSPC);
+
+ if (xchg(&root->fs_info->enospc_unlink, 1))
+ return ERR_PTR(-ENOSPC);
- trans = btrfs_start_transaction(root, 1);
+ path = btrfs_alloc_path();
+ if (!path) {
+ root->fs_info->enospc_unlink = 0;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- btrfs_unreserve_metadata_space(root, 6);
- return PTR_ERR(trans);
+ btrfs_free_path(path);
+ root->fs_info->enospc_unlink = 0;
+ return trans;
+ }
+
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(dir)->location, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret == 0) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ check_link = 0;
+ }
+ btrfs_release_path(root, path);
+
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(inode)->location, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret == 0) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ check_link = 0;
+ }
+ btrfs_release_path(root, path);
+
+ if (ret == 0 && S_ISREG(inode->i_mode)) {
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ inode->i_ino, (u64)-1, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ BUG_ON(ret == 0);
+ if (check_path_shared(root, path))
+ goto out;
+ btrfs_release_path(root, path);
+ }
+
+ if (!check_link) {
+ err = 0;
+ goto out;
+ }
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ dentry->d_name.name, dentry->d_name.len, 0);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ err = 0;
+ goto out;
}
+ btrfs_release_path(root, path);
+
+ ref = btrfs_lookup_inode_ref(trans, root, path,
+ dentry->d_name.name, dentry->d_name.len,
+ inode->i_ino, dir->i_ino, 0);
+ if (IS_ERR(ref)) {
+ err = PTR_ERR(ref);
+ goto out;
+ }
+ BUG_ON(!ref);
+ if (check_path_shared(root, path))
+ goto out;
+ index = btrfs_inode_ref_index(path->nodes[0], ref);
+ btrfs_release_path(root, path);
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+ dentry->d_name.name, dentry->d_name.len, 0);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto out;
+ }
+ BUG_ON(ret == -ENOENT);
+ if (check_path_shared(root, path))
+ goto out;
+
+ err = 0;
+out:
+ btrfs_free_path(path);
+ if (err) {
+ btrfs_end_transaction(trans, root);
+ root->fs_info->enospc_unlink = 0;
+ return ERR_PTR(err);
+ }
+
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+ return trans;
+}
+
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ BUG_ON(!root->fs_info->enospc_unlink);
+ root->fs_info->enospc_unlink = 0;
+ }
+ btrfs_end_transaction_throttle(trans, root);
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+ unsigned long nr = 0;
+
+ trans = __unlink_start_trans(dir, dentry);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
+ BUG_ON(ret);
- if (inode->i_nlink == 0)
+ if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, inode);
+ BUG_ON(ret);
+ }
nr = trans->blocks_used;
-
- btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 6);
+ __unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
int err = 0;
- int ret;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
- ret = btrfs_reserve_metadata_space(root, 5);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- btrfs_unreserve_metadata_space(root, 5);
+ trans = __unlink_start_trans(dir, dentry);
+ if (IS_ERR(trans))
return PTR_ERR(trans);
- }
btrfs_set_trans_block_group(trans, dir);
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
btrfs_i_size_write(inode, 0);
out:
nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 5);
+ __unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
- if (ret && !err)
- err = ret;
return err;
}
@@ -3029,6 +3380,7 @@ out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
+ BUG_ON(ret);
}
btrfs_free_path(path);
return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
if ((offset & (blocksize - 1)) == 0)
goto out;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret)
- goto out;
-
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
goto out;
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
again:
page = grab_cache_page(mapping, index);
if (!page) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto out;
}
@@ -3132,8 +3479,7 @@ again:
out_unlock:
if (ret)
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
+ struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
u64 mask = root->sectorsize - 1;
u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 hint_byte = 0;
hole_size = last_byte - cur_offset;
- err = btrfs_reserve_metadata_space(root, 2);
- if (err)
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
break;
-
- trans = btrfs_start_transaction(root, 1);
+ }
btrfs_set_trans_block_group(trans, inode);
err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
last_byte - 1, 0);
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 2);
}
free_extent_map(em);
+ em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
+ free_extent_map(em);
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
GFP_NOFS);
return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
}
}
- ret = btrfs_reserve_metadata_space(root, 1);
- if (ret)
- return ret;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 1);
btrfs_btree_balance_dirty(root, nr);
if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
i_size_write(inode, attr->ia_size);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
+ BUG_ON(!trans->block_rsv);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
btrfs_i_size_write(inode, 0);
while (1) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
- ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ trans->block_rsv = root->orphan_block_rsv;
+
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv, 0, 5);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ continue;
+ }
+ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
if (ret != -EAGAIN)
break;
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root, nr);
+
}
if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
return 0;
}
-static noinline void init_btrfs_i(struct inode *inode)
-{
- struct btrfs_inode *bi = BTRFS_I(inode);
-
- bi->generation = 0;
- bi->sequence = 0;
- bi->last_trans = 0;
- bi->last_sub_trans = 0;
- bi->logged_trans = 0;
- bi->delalloc_bytes = 0;
- bi->reserved_bytes = 0;
- bi->disk_i_size = 0;
- bi->flags = 0;
- bi->index_cnt = (u64)-1;
- bi->last_unlink_trans = 0;
- bi->ordered_data_close = 0;
- bi->force_compress = 0;
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
- extent_io_tree_init(&BTRFS_I(inode)->io_tree,
- inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
- inode->i_mapping, GFP_NOFS);
- INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
- INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
- mutex_init(&BTRFS_I(inode)->log_mutex);
-}
-
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
inode->i_ino = args->ino;
- init_btrfs_i(inode);
BTRFS_I(inode)->root = args->root;
btrfs_set_inode_space_info(args->root, inode);
return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
if (!inode)
return ERR_PTR(-ENOMEM);
- init_btrfs_i(inode);
-
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
struct btrfs_trans_handle *trans;
int ret = 0;
- if (root->fs_info->btree_inode == inode)
+ if (BTRFS_I(inode)->dummy_inode)
return 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (BTRFS_I(inode)->dummy_inode)
+ return;
trans = btrfs_join_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
- btrfs_update_inode(trans, root, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret == -ENOSPC) {
+ /* whoops, lets try again with the full transaction */
+ btrfs_end_transaction(trans, root);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %ld\n",
+ inode->i_ino, PTR_ERR(trans));
+ }
+ return;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %d\n",
+ inode->i_ino, ret);
+ }
+ }
+ }
btrfs_end_transaction(trans, root);
}
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* btrfs_get_inode_index_count has an explanation for the magic
* number
*/
- init_btrfs_i(inode);
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
+
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
- btrfs_unreserve_metadata_space(root, 5);
+ btrfs_btree_balance_dirty(root, nr);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
return err;
}
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
- int err;
int drop_inode = 0;
+ int err;
unsigned long nr = 0;
u64 objectid;
u64 index = 0;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (root->objectid != BTRFS_I(inode)->root->objectid)
return -EPERM;
- /*
- * 1 item for inode ref
- * 2 items for dir items
- */
- err = btrfs_reserve_metadata_space(root, 3);
- if (err)
- return err;
-
btrfs_inc_nlink(inode);
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto fail;
+ }
btrfs_set_trans_block_group(trans, dir);
atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
- btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
+
/*
* 2 items for inode and ref
* 2 items for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
-
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_fail;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
out_fail:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
}
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
+ WARN_ON(1);
if (!trans) {
kunmap(page);
free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
return em;
}
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ return ERR_PTR(-ENOMEM);
+
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ alloc_hint = get_extent_allocation_hint(inode, start, len);
+ ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+ alloc_hint, (u64)-1, &ins, 1);
+ if (ret) {
+ em = ERR_PTR(ret);
+ goto out;
+ }
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ em->start = start;
+ em->orig_start = em->start;
+ em->len = ins.offset;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+ }
+
+ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+ ins.offset, ins.offset, 0);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ em = ERR_PTR(ret);
+ }
+out:
+ btrfs_end_transaction(trans, root);
+ return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 len)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ u64 backref_offset;
+ u64 extent_end;
+ u64 num_bytes;
+ int slot;
+ int found_type;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ offset, 0);
+ if (ret < 0)
+ goto out;
+
+ slot = path->slots[0];
+ if (ret == 1) {
+ if (slot == 0) {
+ /* can't find the item, must cow */
+ ret = 0;
+ goto out;
+ }
+ slot--;
+ }
+ ret = 0;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* not our file or wrong item type, must cow */
+ goto out;
+ }
+
+ if (key.offset > offset) {
+ /* Wrong offset, must cow */
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (found_type != BTRFS_FILE_EXTENT_REG &&
+ found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+ /* not a regular extent, must cow */
+ goto out;
+ }
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end < offset + len) {
+ /* extent doesn't include our full range, must cow */
+ goto out;
+ }
+
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out;
+
+ /*
+ * look for other files referencing this extent, if we
+ * find any we must cow
+ */
+ if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ key.offset - backref_offset, disk_bytenr))
+ goto out;
+
+ /*
+ * adjust disk_bytenr and num_bytes to cover just the bytes
+ * in this extent we are about to write. If there
+ * are any csums in that range we have to cow in order
+ * to keep the csums correct
+ */
+ disk_bytenr += backref_offset;
+ disk_bytenr += offset - key.offset;
+ num_bytes = min(offset + len, extent_end) - offset;
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out;
+ /*
+ * all of the above have passed, it is safe to overwrite this extent
+ * without cow
+ */
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start = iblock << inode->i_blkbits;
+ u64 len = bh_result->b_size;
+ struct btrfs_trans_handle *trans;
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ /*
+ * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety lets just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because thats what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ em->block_start == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ return -ENOTBLK;
+ }
+
+ /* Just a good old fashioned hole, return */
+ if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ /* DIO will do one hole at a time, so just unlock a sector */
+ unlock_extent(&BTRFS_I(inode)->io_tree, start,
+ start + root->sectorsize - 1, GFP_NOFS);
+ return 0;
+ }
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (!create) {
+ len = em->len - (start - em->start);
+ goto map;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ int type;
+ int ret;
+ u64 block_start;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ /*
+ * we're not going to log anything, but we do need
+ * to make sure the current transaction stays open
+ * while we look for nocow cross refs
+ */
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ goto must_cow;
+
+ if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ ret = btrfs_add_ordered_extent_dio(inode, start,
+ block_start, len, len, type);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ free_extent_map(em);
+ return ret;
+ }
+ goto unlock;
+ }
+ btrfs_end_transaction(trans, root);
+ }
+must_cow:
+ /*
+ * this will cow the extent, reset the len in case we changed
+ * it above
+ */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ len = min(len, em->len - (start - em->start));
+unlock:
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+ 0, NULL, GFP_NOFS);
+map:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+ if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ free_extent_map(em);
+
+ return 0;
+}
+
+struct btrfs_dio_private {
+ struct inode *inode;
+ u64 logical_offset;
+ u64 disk_bytenr;
+ u64 bytes;
+ u32 *csums;
+ void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start;
+ u32 *private = dip->csums;
+
+ start = dip->logical_offset;
+ do {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ struct page *page = bvec->bv_page;
+ char *kaddr;
+ u32 csum = ~(u32)0;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kaddr = kmap_atomic(page, KM_IRQ0);
+ csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+ csum, bvec->bv_len);
+ btrfs_csum_final(csum, (char *)&csum);
+ kunmap_atomic(kaddr, KM_IRQ0);
+ local_irq_restore(flags);
+
+ flush_dcache_page(bvec->bv_page);
+ if (csum != *private) {
+ printk(KERN_ERR "btrfs csum failed ino %lu off"
+ " %llu csum %u private %u\n",
+ inode->i_ino, (unsigned long long)start,
+ csum, *private);
+ err = -EIO;
+ }
+ }
+
+ start += bvec->bv_len;
+ private++;
+ bvec++;
+ } while (bvec <= bvec_end);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+ dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+ bio->bi_private = dip->private;
+
+ kfree(dip->csums);
+ kfree(dip);
+ dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_ordered_extent *ordered = NULL;
+ struct extent_state *cached_state = NULL;
+ int ret;
+
+ if (err)
+ goto out_done;
+
+ ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+ dip->logical_offset, dip->bytes);
+ if (!ret)
+ goto out_done;
+
+ BUG_ON(!ordered);
+
+ trans = btrfs_join_transaction(root, 1);
+ if (!trans) {
+ err = -ENOMEM;
+ goto out;
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, inode);
+ err = ret;
+ goto out;
+ }
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ ordered->file_offset + ordered->len - 1, 0,
+ &cached_state, GFP_NOFS);
+
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+ ret = btrfs_mark_extent_written(trans, inode,
+ ordered->file_offset,
+ ordered->file_offset +
+ ordered->len);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = insert_reserved_file_extent(trans, inode,
+ ordered->file_offset,
+ ordered->start,
+ ordered->disk_len,
+ ordered->len,
+ ordered->len,
+ 0, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered->file_offset, ordered->len);
+ if (ret) {
+ err = ret;
+ WARN_ON(1);
+ goto out_unlock;
+ }
+ }
+
+ add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+ btrfs_ordered_update_i_size(inode, 0, ordered);
+ btrfs_update_inode(trans, root, inode);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ ordered->file_offset + ordered->len - 1,
+ &cached_state, GFP_NOFS);
+out:
+ btrfs_delalloc_release_metadata(inode, ordered->len);
+ btrfs_end_transaction(trans, root);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+out_done:
+ bio->bi_private = dip->private;
+
+ kfree(dip->csums);
+ kfree(dip);
+ dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags, u64 offset)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+ BUG_ON(ret);
+ return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+ loff_t file_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_dio_private *dip;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ u64 start;
+ int skip_sum;
+ int write = rw & (1 << BIO_RW);
+ int ret = 0;
+
+ skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ dip = kmalloc(sizeof(*dip), GFP_NOFS);
+ if (!dip) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+ dip->csums = NULL;
+
+ if (!skip_sum) {
+ dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+ if (!dip->csums) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+ }
+
+ dip->private = bio->bi_private;
+ dip->inode = inode;
+ dip->logical_offset = file_offset;
+
+ start = dip->logical_offset;
+ dip->bytes = 0;
+ do {
+ dip->bytes += bvec->bv_len;
+ bvec++;
+ } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+ dip->disk_bytenr = (u64)bio->bi_sector << 9;
+ bio->bi_private = dip;
+
+ if (write)
+ bio->bi_end_io = btrfs_endio_direct_write;
+ else
+ bio->bi_end_io = btrfs_endio_direct_read;
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto out_err;
+
+ if (write && !skip_sum) {
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, 0, 0,
+ dip->logical_offset,
+ __btrfs_submit_bio_start_direct_io,
+ __btrfs_submit_bio_done);
+ if (ret)
+ goto out_err;
+ return;
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums_dio(root, inode, bio,
+ dip->logical_offset, dip->csums);
+
+ ret = btrfs_map_bio(root, rw, bio, 0, 1);
+ if (ret)
+ goto out_err;
+ return;
+out_err:
+ kfree(dip->csums);
+ kfree(dip);
+free_ordered:
+ /*
+ * If this is a write, we need to clean up the reserved space and kill
+ * the ordered extent.
+ */
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+ ordered = btrfs_lookup_ordered_extent(inode,
+ dip->logical_offset);
+ if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ btrfs_free_reserved_extent(root, ordered->start,
+ ordered->disk_len);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ int seg;
+ size_t size;
+ unsigned long addr;
+ unsigned blocksize_mask = root->sectorsize - 1;
+ ssize_t retval = -EINVAL;
+ loff_t end = offset;
+
+ if (offset & blocksize_mask)
+ goto out;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = (unsigned long)iov[seg].iov_base;
+ size = iov[seg].iov_len;
+ end += size;
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
+ goto out;
+ }
+ retval = 0;
+out:
+ return retval;
+}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
- return -EINVAL;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart, lockend;
+ ssize_t ret;
+ int writing = rw & WRITE;
+ int write_bits = 0;
+ size_t count = iov_length(iov, nr_segs);
+
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+ offset, nr_segs)) {
+ return 0;
+ }
+
+ lockstart = offset;
+ lockend = offset + count - 1;
+
+ if (writing) {
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ }
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state, GFP_NOFS);
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure theres no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered)
+ break;
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ cond_resched();
+ }
+
+ /*
+ * we don't use btrfs_set_extent_delalloc because we don't want
+ * the dirty or uptodate bits
+ */
+ if (writing) {
+ write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DELALLOC, 0, NULL, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_LOCKED | write_bits,
+ 1, 0, &cached_state, GFP_NOFS);
+ goto out;
+ }
+ }
+
+ free_extent_state(cached_state);
+ cached_state = NULL;
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, 0);
+
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+ /*
+ * We're falling back to buffered, unlock the section we didn't
+ * do IO on.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ }
+out:
+ free_extent_state(cached_state);
+ return ret;
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_start;
u64 page_end;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
@@ -5059,7 +6021,6 @@ again:
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
/* page got truncated out from underneath us */
goto out_unlock;
}
@@ -5100,7 +6061,6 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
ret = VM_FAULT_SIGBUS;
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
goto out_unlock;
}
ret = 0;
@@ -5127,10 +6087,10 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
return ret;
}
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
/*
* setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
+ if (!trans) {
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
+ }
+
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv, 0, 5);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ trans = NULL;
+ continue;
+ }
+
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
+ trans = NULL;
btrfs_btree_balance_dirty(root, nr);
-
- trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
}
if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
struct btrfs_inode *ei;
+ struct inode *inode;
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
+
+ ei->root = NULL;
+ ei->space_info = NULL;
+ ei->generation = 0;
+ ei->sequence = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
- ei->outstanding_extents = 0;
- ei->reserved_extents = 0;
- ei->root = NULL;
+ ei->delalloc_bytes = 0;
+ ei->reserved_bytes = 0;
+ ei->disk_i_size = 0;
+ ei->flags = 0;
+ ei->index_cnt = (u64)-1;
+ ei->last_unlink_trans = 0;
+
spin_lock_init(&ei->accounting_lock);
+ atomic_set(&ei->outstanding_extents, 0);
+ ei->reserved_extents = 0;
+
+ ei->ordered_data_close = 0;
+ ei->orphan_meta_reserved = 0;
+ ei->dummy_inode = 0;
+ ei->force_compress = 0;
+
+ inode = &ei->vfs_inode;
+ extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+ extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+ extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+ mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
- return &ei->vfs_inode;
+ RB_CLEAR_NODE(&ei->rb_node);
+
+ return inode;
}
void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
+ WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+ WARN_ON(BTRFS_I(inode)->reserved_extents);
/*
* This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
- spin_lock(&root->list_lock);
+ spin_lock(&root->orphan_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
inode->i_ino);
list_del_init(&BTRFS_I(inode)->i_orphan);
}
- spin_unlock(&root->list_lock);
+ spin_unlock(&root->orphan_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
-
- /*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- */
- ret = btrfs_reserve_metadata_space(root, 11);
- if (ret)
- return ret;
-
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* close the racy window with snapshot create/destroy ioctl */
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&root->fs_info->subvol_sem);
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ trans = btrfs_start_transaction(root, 20);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, new_dir);
if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
- btrfs_unreserve_metadata_space(root, 11);
return ret;
}
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return 0;
}
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+ struct btrfs_inode *binode;
+ struct inode *inode = NULL;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ while (!list_empty(&root->fs_info->delalloc_inodes)) {
+ binode = list_entry(root->fs_info->delalloc_inodes.next,
+ struct btrfs_inode, delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (inode) {
+ list_move_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ break;
+ }
+
+ list_del_init(&binode->delalloc_inodes);
+ cond_resched_lock(&root->fs_info->delalloc_lock);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ if (inode) {
+ write_inode_now(inode, 0);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ return 1;
+ }
+ return 0;
+}
+
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
/*
* 2 items for inode item and ref
* 2 items for dir items
* 1 item for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto out_fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-out_fail:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
return err;
}
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
- u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
- u64 num_bytes = end - start;
int ret = 0;
- u64 i_size;
while (num_bytes > 0) {
- trans = btrfs_start_transaction(root, 1);
-
- ret = btrfs_reserve_extent(trans, root, num_bytes,
- root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
- if (ret) {
- WARN_ON(1);
- goto stop_trans;
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
}
- ret = btrfs_reserve_metadata_space(root, 3);
+ ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+ 0, *alloc_hint, (u64)-1, &ins, 1);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
- goto stop_trans;
+ btrfs_end_transaction(trans, root);
+ break;
}
ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
num_bytes -= ins.offset;
cur_offset += ins.offset;
- alloc_hint = ins.objectid + ins.offset;
+ *alloc_hint = ins.objectid + ins.offset;
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (actual_len > inode->i_size) &&
- (cur_offset > inode->i_size)) {
-
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
if (cur_offset > actual_len)
- i_size = actual_len;
+ i_size_write(inode, actual_len);
else
- i_size = cur_offset;
- i_size_write(inode, i_size);
- btrfs_ordered_update_i_size(inode, i_size, NULL);
+ i_size_write(inode, cur_offset);
+ i_size_write(inode, cur_offset);
+ btrfs_ordered_update_i_size(inode, cur_offset, NULL);
}
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 3);
}
return ret;
-
-stop_trans:
- btrfs_end_transaction(trans, root);
- return ret;
-
}
static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
goto out;
}
- ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
- alloc_end - alloc_start);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
goto out;
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = prealloc_file_range(inode,
- cur_offset, last_byte,
- alloc_hint, mode, offset+len);
+ ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
if (ret < 0) {
free_extent_map(em);
break;
}
}
- if (em->block_start <= EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
free_extent_map(em);
cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
- btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
- alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a86..4cdb98cf26d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+ 0, &objectid);
+ if (ret)
+ return ret;
/*
* 1 - inode item
* 2 - refs
* 1 - root item
* 2 - dir items
*/
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
-
- ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
- 0, &objectid);
- if (ret)
- goto fail;
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
-
- btrfs_unreserve_metadata_space(root, 6);
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- /*
- * 1 - inode item
- * 2 - refs
- * 1 - root item
- * 2 - dir items
- */
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- goto fail;
-
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- btrfs_unreserve_metadata_space(root, 6);
- goto fail;
- }
- pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
- if (!pending_snapshot->name) {
- ret = -ENOMEM;
- kfree(pending_snapshot);
- btrfs_unreserve_metadata_space(root, 6);
- goto fail;
- }
- memcpy(pending_snapshot->name, name, namelen);
- pending_snapshot->name[namelen] = '\0';
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv);
pending_snapshot->dentry = dentry;
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
pending_snapshot->root = root;
+
+ trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+
+ ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+ BUG_ON(ret);
+
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
BUG_ON(ret);
- btrfs_unreserve_metadata_space(root, 6);
+
+ ret = pending_snapshot->error;
+ if (ret)
+ goto fail;
+
+ btrfs_orphan_cleanup(pending_snapshot->snap);
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
d_instantiate(dentry, inode);
ret = 0;
fail:
+ kfree(pending_snapshot);
return ret;
}
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry,
- name, namelen);
+ error = create_snapshot(snap_src, dentry);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = 1;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret) {
- ret = -ENOSPC;
- break;
- }
-
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- PAGE_CACHE_SIZE);
- ret = -ENOSPC;
- break;
- }
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto err_unlock;
again:
if (inode->i_size == 0 ||
i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
}
page = grab_cache_page(inode->i_mapping, i);
- if (!page)
+ if (!page) {
+ ret = -ENOMEM;
goto err_reservations;
+ }
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ ret = -EIO;
goto err_reservations;
}
}
@@ -644,8 +623,7 @@ again:
wait_on_page_writeback(page);
if (PageDirty(page)) {
- btrfs_free_reserved_data_space(root, inode,
- PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto loop_unlock;
}
@@ -683,7 +661,6 @@ loop_unlock:
page_cache_release(page);
mutex_unlock(&inode->i_mutex);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
i++;
}
@@ -713,9 +690,9 @@ loop_unlock:
return 0;
err_reservations:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
mutex_unlock(&inode->i_mutex);
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
return ret;
}
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
device->name, (unsigned long long)new_size);
if (new_size > old_size) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
} else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out_up_write;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- ret = btrfs_insert_orphan_item(trans,
- root->fs_info->tree_root,
- dest->root_key.objectid);
- BUG_ON(ret);
+ if (!xchg(&dest->orphan_item_inserted, 1)) {
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+ }
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EPERM;
goto out;
}
- btrfs_defrag_root(root, 0);
- btrfs_defrag_root(root->fs_info->extent_root, 0);
+ ret = btrfs_defrag_root(root, 0);
+ if (ret)
+ goto out;
+ ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
break;
case S_IFREG:
if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- btrfs_defrag_file(file, range);
+ ret = btrfs_defrag_file(file, range);
kfree(range);
break;
+ default:
+ ret = -EINVAL;
}
out:
mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_wait_ordered_range(src, off, off+len);
}
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
-
- /* punch hole in destination first */
- btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-
/* clone data */
key.objectid = src->i_ino;
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* note the key will change type as we walk through the
* tree.
*/
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ ret = btrfs_drop_extents(trans, inode,
+ new_key.offset,
+ new_key.offset + datal,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- if (ret)
- goto out;
+ BUG_ON(ret);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (comp && (skip || trim)) {
ret = -EINVAL;
+ btrfs_end_transaction(trans, root);
goto out;
}
size -= skip + trim;
datal -= skip + trim;
+
+ ret = btrfs_drop_extents(trans, inode,
+ new_key.offset,
+ new_key.offset + datal,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- if (ret)
- goto out;
+ BUG_ON(ret);
if (skip) {
u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
btrfs_mark_buffer_dirty(leaf);
- }
+ btrfs_release_path(root, path);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (new_key.offset + datal > inode->i_size)
+ btrfs_i_size_write(inode,
+ new_key.offset + datal);
+ BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
+ }
next:
btrfs_release_path(root, path);
key.offset++;
@@ -1717,17 +1727,7 @@ next:
ret = 0;
out:
btrfs_release_path(root, path);
- if (ret == 0) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- if (destoff + olen > inode->i_size)
- btrfs_i_size_write(inode, destoff + olen);
- BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
- ret = btrfs_update_inode(trans, root, inode);
- }
- btrfs_end_transaction(trans, root);
unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
- if (ret)
- vmtruncate(inode, 0);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2d..e56c72bc5ad 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
return 1;
}
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
+{
+ if (file_offset + len <= entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
/*
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int dio)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
+ if (dio)
+ set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
/* one ref for the tree */
atomic_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
return 0;
}
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 1);
+}
+
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
- inode, 1);
-
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -588,6 +609,47 @@ out:
return entry;
}
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ node = tree_search(tree, file_offset + len);
+ if (!node)
+ goto out;
+ }
+
+ while (1) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ break;
+
+ if (entry->file_offset >= file_offset + len) {
+ entry = NULL;
+ break;
+ }
+ entry = NULL;
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ if (entry)
+ atomic_inc(&entry->refs);
+ spin_unlock(&tree->lock);
+ return entry;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f04..8ac365492a3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int tyep);
+ u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941de..05d41e56923 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
struct backref_node {
struct rb_node rb_node;
u64 bytenr;
- /* objectid tree block owner */
+
+ u64 new_bytenr;
+ /* objectid of tree block owner, can be not uptodate */
u64 owner;
+ /* link to pending, changed or detached list */
+ struct list_head list;
/* list of upper level blocks reference this block */
struct list_head upper;
/* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
struct extent_buffer *eb;
/* level of tree block */
unsigned int level:8;
- /* 1 if the block is root of old snapshot */
- unsigned int old_root:1;
- /* 1 if no child blocks in the cache */
+ /* is the block in non-reference counted tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node in the cache */
unsigned int lowest:1;
/* is the extent buffer locked */
unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
unsigned int processed:1;
/* have backrefs of this block been checked */
unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been cowed but some upper
+ * level block pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /*
+ * 1 if the backref node isn't connected to any other
+ * backref node.
+ */
+ unsigned int detached:1;
};
/*
@@ -74,7 +88,6 @@ struct backref_node {
struct backref_edge {
struct list_head list[2];
struct backref_node *node[2];
- u64 blockptr;
};
#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
struct backref_cache {
/* red black tree of all backref nodes in the cache */
struct rb_root rb_root;
- /* list of backref nodes with no child block in the cache */
+ /* for passing backref nodes to btrfs_reloc_cow_block */
+ struct backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * list of blocks that have been cowed but some block
+ * pointers in upper level blocks may not reflect the
+ * new location
+ */
struct list_head pending[BTRFS_MAX_LEVEL];
- spinlock_t lock;
+ /* list of backref nodes with no child node */
+ struct list_head leaves;
+ /* list of blocks that have been cowed in current transaction */
+ struct list_head changed;
+ /* list of detached backref node. */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
};
/*
@@ -113,15 +142,6 @@ struct tree_block {
unsigned int key_ready:1;
};
-/* inode vector */
-#define INODEVEC_SIZE 16
-
-struct inodevec {
- struct list_head list;
- struct inode *inode[INODEVEC_SIZE];
- int nr;
-};
-
#define MAX_EXTENTS 128
struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
struct btrfs_root *extent_root;
/* inode for moving data */
struct inode *data_inode;
- struct btrfs_workers workers;
+
+ struct btrfs_block_rsv *block_rsv;
+
+ struct backref_cache backref_cache;
+
+ struct file_extent_cluster cluster;
/* tree blocks have been processed */
struct extent_io_tree processed_blocks;
/* map start of tree root to corresponding reloc tree */
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* size of metadata reservation for merging reloc trees */
+ u64 merging_rsv_size;
+ /* size of relocated tree nodes */
+ u64 nodes_relocated;
+
u64 search_start;
u64 extents_found;
- u64 extents_skipped;
- int stage;
- int create_reloc_root;
+
+ int block_rsv_retries;
+
+ unsigned int stage:8;
+ unsigned int create_reloc_tree:1;
+ unsigned int merge_reloc_tree:1;
unsigned int found_file_extent:1;
- unsigned int found_old_snapshot:1;
+ unsigned int commit_transaction:1;
};
/* stages of data relocation */
#define MOVE_DATA_EXTENTS 0
#define UPDATE_DATA_PTRS 1
-/*
- * merge reloc tree to corresponding fs tree in worker threads
- */
-struct async_merge {
- struct btrfs_work work;
- struct reloc_control *rc;
- struct btrfs_root *root;
- struct completion *done;
- atomic_t *num_pending;
-};
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node);
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node);
static void mapping_tree_init(struct mapping_tree *tree)
{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
- spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+}
+
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct backref_node, lower);
+ remove_backref_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ BUG_ON(!list_empty(&cache->pending[i]));
+ BUG_ON(!list_empty(&cache->changed));
+ BUG_ON(!list_empty(&cache->detached));
+ BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+ BUG_ON(cache->nr_nodes);
+ BUG_ON(cache->nr_edges);
+}
+
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+ struct backref_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (node) {
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ }
+ return node;
+}
+
+static void free_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ if (node) {
+ cache->nr_nodes--;
+ kfree(node);
+ }
+}
+
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+ struct backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
}
-static void backref_node_init(struct backref_node *node)
+static void free_backref_edge(struct backref_cache *cache,
+ struct backref_edge *edge)
{
- memset(node, 0, sizeof(*node));
- INIT_LIST_HEAD(&node->upper);
- INIT_LIST_HEAD(&node->lower);
- RB_CLEAR_NODE(&node->rb_node);
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
}
static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
edges[idx++] = edge;
node = edge->node[UPPER];
}
+ BUG_ON(node->detached);
*index = idx;
return node;
}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
return NULL;
}
+static void unlock_node_buffer(struct backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
static void drop_node_buffer(struct backref_node *node)
{
if (node->eb) {
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
+ unlock_node_buffer(node);
free_extent_buffer(node->eb);
node->eb = NULL;
}
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
static void drop_backref_node(struct backref_cache *tree,
struct backref_node *node)
{
- BUG_ON(!node->lowest);
BUG_ON(!list_empty(&node->upper));
drop_node_buffer(node);
+ list_del(&node->list);
list_del(&node->lower);
-
- rb_erase(&node->rb_node, &tree->rb_root);
- kfree(node);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ free_backref_node(tree, node);
}
/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
if (!node)
return;
- BUG_ON(!node->lowest);
+ BUG_ON(!node->lowest && !node->detached);
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next, struct backref_edge,
list[LOWER]);
upper = edge->node[UPPER];
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
- kfree(edge);
+ free_backref_edge(cache, edge);
+
+ if (RB_EMPTY_NODE(&upper->rb_node)) {
+ BUG_ON(!list_empty(&node->upper));
+ drop_backref_node(cache, node);
+ node = upper;
+ node->lowest = 1;
+ continue;
+ }
/*
- * add the node to pending list if no other
+ * add the node to leaf node list if no other
* child block cached.
*/
if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower,
- &cache->pending[upper->level]);
+ list_add_tail(&upper->lower, &cache->leaves);
upper->lowest = 1;
}
}
+
drop_backref_node(cache, node);
}
+static void update_backref_node(struct backref_cache *cache,
+ struct backref_node *node, u64 bytenr)
+{
+ struct rb_node *rb_node;
+ rb_erase(&node->rb_node, &cache->rb_root);
+ node->bytenr = bytenr;
+ rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ BUG_ON(rb_node);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+ struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int level = 0;
+
+ if (cache->last_trans == 0) {
+ cache->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (cache->last_trans == trans->transid)
+ return 0;
+
+ /*
+ * detached nodes are used to avoid unnecessary backref
+ * lookup. transaction commit changes the extent tree.
+ * so the detached nodes are no longer useful.
+ */
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->changed)) {
+ node = list_entry(cache->changed.next,
+ struct backref_node, list);
+ list_del_init(&node->list);
+ BUG_ON(node->pending);
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+
+ /*
+ * some nodes can be left in the pending list if there were
+ * errors during processing the pending nodes.
+ */
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ list_for_each_entry(node, &cache->pending[level], list) {
+ BUG_ON(!node->pending);
+ if (node->bytenr == node->new_bytenr)
+ continue;
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+ }
+
+ cache->last_trans = 0;
+ return 1;
+}
+
+static int should_ignore_root(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+
+ if (!root->ref_cows)
+ return 0;
+
+ reloc_root = root->reloc_root;
+ if (!reloc_root)
+ return 0;
+
+ if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
+ root->fs_info->running_transaction->transid - 1)
+ return 0;
+ /*
+ * if there is reloc tree and it was created in previous
+ * transaction backref lookup can find the reloc tree,
+ * so backref node for the fs tree root is useless for
+ * relocation.
+ */
+ return 1;
+}
+
/*
* find reloc tree by address of tree root
*/
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
* for all upper level blocks that directly/indirectly reference the
* block are also cached.
*/
-static struct backref_node *build_backref_tree(struct reloc_control *rc,
- struct backref_cache *cache,
- struct btrfs_key *node_key,
- int level, u64 bytenr)
+static noinline_for_stack
+struct backref_node *build_backref_tree(struct reloc_control *rc,
+ struct btrfs_key *node_key,
+ int level, u64 bytenr)
{
+ struct backref_cache *cache = &rc->backref_cache;
struct btrfs_path *path1;
struct btrfs_path *path2;
struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
unsigned long end;
unsigned long ptr;
LIST_HEAD(list);
+ LIST_HEAD(useless);
+ int cowonly;
int ret;
int err = 0;
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
goto out;
}
- node = kmalloc(sizeof(*node), GFP_NOFS);
+ node = alloc_backref_node(cache);
if (!node) {
err = -ENOMEM;
goto out;
}
- backref_node_init(node);
node->bytenr = bytenr;
- node->owner = 0;
node->level = level;
node->lowest = 1;
cur = node;
@@ -587,17 +780,20 @@ again:
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
key.type == BTRFS_EXTENT_REF_V0_KEY) {
- if (key.objectid == key.offset &&
- key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
struct btrfs_extent_ref_v0 *ref0;
ref0 = btrfs_item_ptr(eb, path1->slots[0],
struct btrfs_extent_ref_v0);
root = find_tree_root(rc, eb, ref0);
- if (root)
- cur->root = root;
- else
- cur->old_root = 1;
- break;
+ if (!root->ref_cows)
+ cur->cowonly = 1;
+ if (key.objectid == key.offset) {
+ if (root && !should_ignore_root(root))
+ cur->root = root;
+ else
+ list_add(&cur->list, &useless);
+ break;
+ }
}
#else
BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +810,20 @@ again:
break;
}
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ edge = alloc_backref_edge(cache);
if (!edge) {
err = -ENOMEM;
goto out;
}
rb_node = tree_search(&cache->rb_root, key.offset);
if (!rb_node) {
- upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ upper = alloc_backref_node(cache);
if (!upper) {
- kfree(edge);
+ free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
}
- backref_node_init(upper);
upper->bytenr = key.offset;
- upper->owner = 0;
upper->level = cur->level + 1;
/*
* backrefs for the upper level block isn't
@@ -639,11 +833,12 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
+ BUG_ON(!upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- list_add(&edge->list[LOWER], &cur->upper);
- edge->node[UPPER] = upper;
+ list_add_tail(&edge->list[LOWER], &cur->upper);
edge->node[LOWER] = cur;
+ edge->node[UPPER] = upper;
goto next;
} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +852,17 @@ again:
goto out;
}
+ if (!root->ref_cows)
+ cur->cowonly = 1;
+
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
cur->bytenr);
- cur->root = root;
+ if (should_ignore_root(root))
+ list_add(&cur->list, &useless);
+ else
+ cur->root = root;
break;
}
@@ -692,11 +893,14 @@ again:
if (!path2->nodes[level]) {
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
lower->bytenr);
- lower->root = root;
+ if (should_ignore_root(root))
+ list_add(&lower->list, &useless);
+ else
+ lower->root = root;
break;
}
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ edge = alloc_backref_edge(cache);
if (!edge) {
err = -ENOMEM;
goto out;
@@ -705,16 +909,17 @@ again:
eb = path2->nodes[level];
rb_node = tree_search(&cache->rb_root, eb->start);
if (!rb_node) {
- upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ upper = alloc_backref_node(cache);
if (!upper) {
- kfree(edge);
+ free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
}
- backref_node_init(upper);
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
+ if (!root->ref_cows)
+ upper->cowonly = 1;
/*
* if we know the block isn't shared
@@ -744,10 +949,12 @@ again:
rb_node);
BUG_ON(!upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
}
list_add_tail(&edge->list[LOWER], &lower->upper);
- edge->node[UPPER] = upper;
edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
if (rb_node)
break;
@@ -785,8 +992,13 @@ next:
* into the cache.
*/
BUG_ON(!node->checked);
- rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
- BUG_ON(rb_node);
+ cowonly = node->cowonly;
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, node->bytenr,
+ &node->rb_node);
+ BUG_ON(rb_node);
+ list_add_tail(&node->lower, &cache->leaves);
+ }
list_for_each_entry(edge, &node->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1007,14 @@ next:
edge = list_entry(list.next, struct backref_edge, list[UPPER]);
list_del_init(&edge->list[UPPER]);
upper = edge->node[UPPER];
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ continue;
+ }
if (!RB_EMPTY_NODE(&upper->rb_node)) {
if (upper->lowest) {
@@ -807,25 +1027,69 @@ next:
}
BUG_ON(!upper->checked);
- rb_node = tree_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- BUG_ON(rb_node);
+ BUG_ON(cowonly != upper->cowonly);
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ BUG_ON(rb_node);
+ }
list_add_tail(&edge->list[UPPER], &upper->lower);
list_for_each_entry(edge, &upper->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
}
+ /*
+ * process useless backref nodes. backref nodes for tree leaves
+ * are deleted from the cache. backref nodes for upper level
+ * tree blocks are left in the cache to avoid unnecessary backref
+ * lookup.
+ */
+ while (!list_empty(&useless)) {
+ upper = list_entry(useless.next, struct backref_node, list);
+ list_del_init(&upper->list);
+ BUG_ON(!list_empty(&upper->upper));
+ if (upper == node)
+ node = NULL;
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+ while (!list_empty(&upper->lower)) {
+ edge = list_entry(upper->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ }
+ __mark_block_processed(rc, upper);
+ if (upper->level > 0) {
+ list_add(&upper->list, &cache->detached);
+ upper->detached = 1;
+ } else {
+ rb_erase(&upper->rb_node, &cache->rb_root);
+ free_backref_node(cache, upper);
+ }
+ }
out:
btrfs_free_path(path1);
btrfs_free_path(path2);
if (err) {
- INIT_LIST_HEAD(&list);
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, upper);
+ list_del_init(&lower->upper);
+ }
upper = node;
+ INIT_LIST_HEAD(&list);
while (upper) {
if (RB_EMPTY_NODE(&upper->rb_node)) {
list_splice_tail(&upper->upper, &list);
- kfree(upper);
+ free_backref_node(cache, upper);
}
if (list_empty(&list))
@@ -833,15 +1097,104 @@ out:
edge = list_entry(list.next, struct backref_edge,
list[LOWER]);
+ list_del(&edge->list[LOWER]);
upper = edge->node[UPPER];
- kfree(edge);
+ free_backref_edge(cache, edge);
}
return ERR_PTR(err);
}
+ BUG_ON(node && node->detached);
return node;
}
/*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *src,
+ struct btrfs_root *dest)
+{
+ struct btrfs_root *reloc_root = src->reloc_root;
+ struct backref_cache *cache = &rc->backref_cache;
+ struct backref_node *node = NULL;
+ struct backref_node *new_node;
+ struct backref_edge *edge;
+ struct backref_edge *new_edge;
+ struct rb_node *rb_node;
+
+ if (cache->last_trans > 0)
+ update_backref_cache(trans, cache);
+
+ rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node, rb_node);
+ if (node->detached)
+ node = NULL;
+ else
+ BUG_ON(node->new_bytenr != reloc_root->node->start);
+ }
+
+ if (!node) {
+ rb_node = tree_search(&cache->rb_root,
+ reloc_root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ BUG_ON(node->detached);
+ }
+ }
+
+ if (!node)
+ return 0;
+
+ new_node = alloc_backref_node(cache);
+ if (!new_node)
+ return -ENOMEM;
+
+ new_node->bytenr = dest->node->start;
+ new_node->level = node->level;
+ new_node->lowest = node->lowest;
+ new_node->root = dest;
+
+ if (!node->lowest) {
+ list_for_each_entry(edge, &node->lower, list[UPPER]) {
+ new_edge = alloc_backref_edge(cache);
+ if (!new_edge)
+ goto fail;
+
+ new_edge->node[UPPER] = new_node;
+ new_edge->node[LOWER] = edge->node[LOWER];
+ list_add_tail(&new_edge->list[UPPER],
+ &new_node->lower);
+ }
+ }
+
+ rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
+ BUG_ON(rb_node);
+
+ if (!new_node->lowest) {
+ list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+ list_add_tail(&new_edge->list[LOWER],
+ &new_edge->node[LOWER]->upper);
+ }
+ }
+ return 0;
+fail:
+ while (!list_empty(&new_node->lower)) {
+ new_edge = list_entry(new_node->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&new_edge->list[UPPER]);
+ free_backref_edge(cache, new_edge);
+ }
+ free_backref_node(cache, new_node);
+ return -ENOMEM;
+}
+
+/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
return 0;
}
-/*
- * create reloc tree for a given fs tree. reloc tree is just a
- * snapshot of the fs tree with special root objectid.
- */
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
{
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
@@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_key root_key;
int ret;
- if (root->reloc_root) {
- reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
- return 0;
- }
-
- if (!root->fs_info->reloc_ctl ||
- !root->fs_info->reloc_ctl->create_reloc_root ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
- return 0;
-
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
BUG_ON(!root_item);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = root->root_key.objectid;
+ root_key.offset = objectid;
- ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
- BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (root->root_key.objectid == objectid) {
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ btrfs_set_root_last_snapshot(&root->root_item,
+ trans->transid - 1);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
+ * the source tree is a reloc tree, all tree blocks
+ * modified after it was created have RELOC flag
+ * set in their headers. so it's OK to not update
+ * the 'last_snapshot'.
+ */
+ ret = btrfs_copy_root(trans, root, root->node, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+ }
- btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
memcpy(root_item, &root->root_item, sizeof(*root_item));
- btrfs_set_root_refs(root_item, 1);
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
- memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
- root_item->drop_level = 0;
+
+ if (root->root_key.objectid == objectid) {
+ btrfs_set_root_refs(root_item, 0);
+ memset(&root_item->drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ root_item->drop_level = 0;
+ }
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
&root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
+ return reloc_root;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+ int clear_rsv = 0;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (!rc || !rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!trans->block_rsv) {
+ trans->block_rsv = rc->block_rsv;
+ clear_rsv = 1;
+ }
+ reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ if (clear_rsv)
+ trans->block_rsv = NULL;
__add_reloc_root(reloc_root);
root->reloc_root = reloc_root;
@@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
- if (btrfs_root_refs(root_item) == 0) {
+ if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+ btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
del = 1;
}
@@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
goto out;
}
- if (new_bytenr)
- *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
ret = 0;
out:
btrfs_free_path(path);
@@ -1114,19 +1503,18 @@ out:
* update file extent items in the tree leaf to point to
* the new locations.
*/
-static int replace_file_extents(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct list_head *inode_list)
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf)
{
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct inode *inode = NULL;
- struct inodevec *ivec = NULL;
u64 parent;
u64 bytenr;
- u64 new_bytenr;
+ u64 new_bytenr = 0;
u64 num_bytes;
u64 end;
u32 nritems;
@@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- if (!ivec || ivec->nr == INODEVEC_SIZE) {
- ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
- BUG_ON(!ivec);
- ivec->nr = 0;
- list_add_tail(&ivec->list, inode_list);
- }
if (first) {
inode = find_next_inode(root, key.objectid);
- if (inode)
- ivec->inode[ivec->nr++] = inode;
first = 0;
} else if (inode && inode->i_ino < key.objectid) {
+ btrfs_add_delayed_iput(inode);
inode = find_next_inode(root, key.objectid);
- if (inode)
- ivec->inode[ivec->nr++] = inode;
}
if (inode && inode->i_ino == key.objectid) {
end = key.offset +
@@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
ret = get_new_location(rc->data_inode, &new_bytenr,
bytenr, num_bytes);
- if (ret > 0)
+ if (ret > 0) {
+ WARN_ON(1);
continue;
+ }
BUG_ON(ret < 0);
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
}
if (dirty)
btrfs_mark_buffer_dirty(leaf);
+ if (inode)
+ btrfs_add_delayed_iput(inode);
return 0;
}
@@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* if no block got replaced, 0 is returned. if there are other
* errors, a negative error number is returned.
*/
-static int replace_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *dest, struct btrfs_root *src,
- struct btrfs_path *path, struct btrfs_key *next_key,
- struct extent_buffer **leaf,
- int lowest_level, int max_level)
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dest, struct btrfs_root *src,
+ struct btrfs_path *path, struct btrfs_key *next_key,
+ int lowest_level, int max_level)
{
struct extent_buffer *eb;
struct extent_buffer *parent;
@@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
u64 new_ptr_gen;
u64 last_snapshot;
u32 blocksize;
+ int cow = 0;
int level;
int ret;
int slot;
BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(lowest_level > 1 && leaf);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
-
+again:
slot = path->slots[lowest_level];
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
@@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
return 0;
}
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
- BUG_ON(ret);
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ BUG_ON(ret);
+ }
btrfs_set_lock_blocking(eb);
if (next_key) {
@@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
memcmp_node_keys(parent, slot, path, level)) {
- if (level <= lowest_level && !leaf) {
+ if (level <= lowest_level) {
ret = 0;
break;
}
@@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
eb = read_tree_block(dest, old_bytenr, blocksize,
old_ptr_gen);
btrfs_tree_lock(eb);
- ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
- BUG_ON(ret);
- btrfs_set_lock_blocking(eb);
-
- if (level <= lowest_level) {
- *leaf = eb;
- ret = 0;
- break;
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, parent,
+ slot, &eb);
+ BUG_ON(ret);
}
+ btrfs_set_lock_blocking(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
continue;
}
+ if (!cow) {
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ cow = 1;
+ goto again;
+ }
+
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
btrfs_release_path(src, path);
@@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
return 0;
}
-static void put_inodes(struct list_head *list)
-{
- struct inodevec *ivec;
- while (!list_empty(list)) {
- ivec = list_entry(list->next, struct inodevec, list);
- list_del(&ivec->list);
- while (ivec->nr > 0) {
- ivec->nr--;
- iput(ivec->inode[ivec->nr]);
- }
- kfree(ivec);
- }
-}
-
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key)
@@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
struct btrfs_path *path;
- struct extent_buffer *leaf = NULL;
+ struct extent_buffer *leaf;
unsigned long nr;
int level;
int max_level;
int replaced = 0;
int ret;
int err = 0;
+ u32 min_reserved;
path = btrfs_alloc_path();
if (!path)
@@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
- trans = btrfs_start_transaction(root, 1);
+ min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ memset(&next_key, 0, sizeof(next_key));
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(reloc_root, path);
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ trans->block_rsv = rc->block_rsv;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto out;
+ ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
+ min_reserved, 0);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ continue;
}
- leaf = path->nodes[0];
- btrfs_unlock_up_safe(path, 1);
- ret = replace_file_extents(trans, rc, root, leaf,
- &inode_list);
- if (ret < 0)
- err = ret;
- goto out;
- }
-
- memset(&next_key, 0, sizeof(next_key));
-
- while (1) {
- leaf = NULL;
replaced = 0;
- trans = btrfs_start_transaction(root, 1);
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (!find_next_key(path, level, &key) &&
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
- } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
- ret = replace_path(trans, root, reloc_root,
- path, &next_key, &leaf,
- level, max_level);
} else {
- ret = replace_path(trans, root, reloc_root,
- path, &next_key, NULL,
- level, max_level);
+ ret = replace_path(trans, root, reloc_root, path,
+ &next_key, level, max_level);
}
if (ret < 0) {
err = ret;
@@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
replaced = 1;
- } else if (leaf) {
- /*
- * no block got replaced, try replacing file extents
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- ret = replace_file_extents(trans, rc, root, leaf,
- &inode_list);
- btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- BUG_ON(ret < 0);
}
ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
root_item->drop_level = level;
nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
- /*
- * put inodes outside transaction, otherwise we may deadlock.
- */
- put_inodes(&inode_list);
-
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
}
@@ -1765,87 +2109,125 @@ out:
sizeof(root_item->drop_progress));
root_item->drop_level = 0;
btrfs_set_root_refs(root_item, 0);
+ btrfs_update_reloc_root(trans, root);
}
nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
- put_inodes(&inode_list);
-
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
return err;
}
-/*
- * callback for the work threads.
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
+ struct btrfs_root *root = rc->extent_root;
struct btrfs_root *reloc_root;
- struct async_merge *async;
+ struct btrfs_trans_handle *trans;
+ LIST_HEAD(reloc_roots);
+ u64 num_bytes = 0;
+ int ret;
+ int retries = 0;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ rc->merging_rsv_size += rc->nodes_relocated * 2;
+ mutex_unlock(&root->fs_info->trans_mutex);
+again:
+ if (!err) {
+ num_bytes = rc->merging_rsv_size;
+ ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+ num_bytes, &retries);
+ if (ret)
+ err = ret;
+ }
+
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+
+ if (!err) {
+ if (num_bytes != rc->merging_rsv_size) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_block_rsv_release(rc->extent_root,
+ rc->block_rsv, num_bytes);
+ retries = 0;
+ goto again;
+ }
+ }
- async = container_of(work, struct async_merge, work);
- reloc_root = async->root;
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&rc->reloc_roots)) {
+ reloc_root = list_entry(rc->reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del_init(&reloc_root->root_list);
- if (btrfs_root_refs(&reloc_root->root_item) > 0) {
root = read_fs_root(reloc_root->fs_info,
reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
- merge_reloc_root(async->rc, root);
-
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * set reference count to 1, so btrfs_recover_relocation
+ * knows it should resumes merging
+ */
+ if (!err)
+ btrfs_set_root_refs(&reloc_root->root_item, 1);
btrfs_update_reloc_root(trans, root);
- btrfs_end_transaction(trans, root);
- }
- btrfs_drop_snapshot(reloc_root, 0);
+ list_add(&reloc_root->root_list, &reloc_roots);
+ }
- if (atomic_dec_and_test(async->num_pending))
- complete(async->done);
+ list_splice(&reloc_roots, &rc->reloc_roots);
- kfree(async);
+ if (!err)
+ btrfs_commit_transaction(trans, rc->extent_root);
+ else
+ btrfs_end_transaction(trans, rc->extent_root);
+ return err;
}
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
{
- struct async_merge *async;
struct btrfs_root *root;
- struct completion done;
- atomic_t num_pending;
+ struct btrfs_root *reloc_root;
+ LIST_HEAD(reloc_roots);
+ int found = 0;
+ int ret;
+again:
+ root = rc->extent_root;
+ mutex_lock(&root->fs_info->trans_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->trans_mutex);
- init_completion(&done);
- atomic_set(&num_pending, 1);
+ while (!list_empty(&reloc_roots)) {
+ found = 1;
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
- while (!list_empty(&rc->reloc_roots)) {
- root = list_entry(rc->reloc_roots.next,
- struct btrfs_root, root_list);
- list_del_init(&root->root_list);
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ root = read_fs_root(reloc_root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
- async = kmalloc(sizeof(*async), GFP_NOFS);
- BUG_ON(!async);
- async->work.func = merge_func;
- async->work.flags = 0;
- async->rc = rc;
- async->root = root;
- async->done = &done;
- async->num_pending = &num_pending;
- atomic_inc(&num_pending);
- btrfs_queue_worker(&rc->workers, &async->work);
+ ret = merge_reloc_root(rc, root);
+ BUG_ON(ret);
+ } else {
+ list_del_init(&reloc_root->root_list);
+ }
+ btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
}
- if (!atomic_dec_and_test(&num_pending))
- wait_for_completion(&done);
-
+ if (found) {
+ found = 0;
+ goto again;
+ }
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
return 0;
}
@@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
return btrfs_record_root_in_trans(trans, root);
}
-/*
- * select one tree from trees that references the block.
- * for blocks in refernce counted trees, we preper reloc tree.
- * if no reloc tree found and reloc_only is true, NULL is returned.
- */
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
- struct backref_node *node,
- struct backref_edge *edges[],
- int *nr, int reloc_only)
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct backref_edge *edges[], int *nr)
{
struct backref_node *next;
struct btrfs_root *root;
- int index;
- int loop = 0;
-again:
- index = 0;
+ int index = 0;
+
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- if (!root) {
- BUG_ON(!node->old_root);
- goto skip;
- }
-
- /* no other choice for non-refernce counted tree */
- if (!root->ref_cows) {
- BUG_ON(reloc_only);
- break;
- }
+ BUG_ON(!root);
+ BUG_ON(!root->ref_cows);
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
break;
}
- if (loop) {
- btrfs_record_root_in_trans(trans, root);
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+
+ if (next->new_bytenr != root->node->start) {
+ BUG_ON(next->new_bytenr);
+ BUG_ON(!list_empty(&next->list));
+ next->new_bytenr = root->node->start;
+ next->root = root;
+ list_add_tail(&next->list,
+ &rc->backref_cache.changed);
+ __mark_block_processed(rc, next);
break;
}
- if (reloc_only || next != node) {
- if (!root->reloc_root)
- btrfs_record_root_in_trans(trans, root);
- root = root->reloc_root;
- /*
- * if the reloc tree was created in current
- * transation, there is no node in backref tree
- * corresponds to the root of the reloc tree.
- */
- if (btrfs_root_last_snapshot(&root->root_item) ==
- trans->transid - 1)
- break;
- }
-skip:
+ WARN_ON(1);
root = NULL;
next = walk_down_backref(edges, &index);
if (!next || next->level <= node->level)
break;
}
+ if (!root)
+ return NULL;
- if (!root && !loop && !reloc_only) {
- loop = 1;
- goto again;
+ *nr = index;
+ next = node;
+ /* setup backref node path for btrfs_reloc_cow_block */
+ while (1) {
+ rc->backref_cache.path[next->level] = next;
+ if (--index < 0)
+ break;
+ next = edges[index]->node[UPPER];
}
-
- if (root)
- *nr = index;
- else
- *nr = 0;
-
return root;
}
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
struct backref_node *node)
{
+ struct backref_node *next;
+ struct btrfs_root *root;
+ struct btrfs_root *fs_root = NULL;
struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
- int nr;
- return __select_one_root(trans, node, edges, &nr, 0);
+ int index = 0;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+ BUG_ON(!root);
+
+ /* no other choice for non-refernce counted tree */
+ if (!root->ref_cows)
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ fs_root = root;
+
+ if (next != node)
+ return NULL;
+
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+
+ if (!fs_root)
+ return ERR_PTR(-ENOENT);
+ return fs_root;
}
static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
- struct backref_node *node,
- struct backref_edge *edges[], int *nr)
+u64 calcu_metadata_size(struct reloc_control *rc,
+ struct backref_node *node, int reserve)
{
- return __select_one_root(trans, node, edges, nr, 1);
+ struct backref_node *next = node;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ u64 num_bytes = 0;
+ int index = 0;
+
+ BUG_ON(reserve && node->processed);
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed && (reserve || next != node))
+ break;
+
+ num_bytes += btrfs_level_size(rc->extent_root,
+ next->level);
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+ return num_bytes;
}
-static void grab_path_buffers(struct btrfs_path *path,
- struct backref_node *node,
- struct backref_edge *edges[], int nr)
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node)
{
- int i = 0;
- while (1) {
- drop_node_buffer(node);
- node->eb = path->nodes[node->level];
- BUG_ON(!node->eb);
- if (path->locks[node->level])
- node->locked = 1;
- path->nodes[node->level] = NULL;
- path->locks[node->level] = 0;
-
- if (i >= nr)
- break;
+ struct btrfs_root *root = rc->extent_root;
+ u64 num_bytes;
+ int ret;
+
+ num_bytes = calcu_metadata_size(rc, node, 1) * 2;
- edges[i]->blockptr = node->eb->start;
- node = edges[i]->node[UPPER];
- i++;
+ trans->block_rsv = rc->block_rsv;
+ ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+ &rc->block_rsv_retries);
+ if (ret) {
+ if (ret == -EAGAIN)
+ rc->commit_transaction = 1;
+ return ret;
}
+
+ rc->block_rsv_retries = 0;
+ return 0;
+}
+
+static void release_metadata_space(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
}
/*
@@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
* in that case this function just updates pointers.
*/
static int do_relocation(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
struct backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
@@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
BUG_ON(lowest && node->eb);
path->lowest_level = node->level + 1;
+ rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
cond_resched();
- if (node->eb && node->eb->start == edge->blockptr)
- continue;
upper = edge->node[UPPER];
- root = select_reloc_root(trans, upper, edges, &nr);
- if (!root)
- continue;
-
- if (upper->eb && !upper->locked)
+ root = select_reloc_root(trans, rc, upper, edges, &nr);
+ BUG_ON(!root);
+
+ if (upper->eb && !upper->locked) {
+ if (!lowest) {
+ ret = btrfs_bin_search(upper->eb, key,
+ upper->level, &slot);
+ BUG_ON(ret);
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (node->eb->start == bytenr)
+ goto next;
+ }
drop_node_buffer(upper);
+ }
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
BUG_ON(ret > 0);
- slot = path->slots[upper->level];
+ if (!upper->eb) {
+ upper->eb = path->nodes[upper->level];
+ path->nodes[upper->level] = NULL;
+ } else {
+ BUG_ON(upper->eb != path->nodes[upper->level]);
+ }
- btrfs_unlock_up_safe(path, upper->level + 1);
- grab_path_buffers(path, upper, edges, nr);
+ upper->locked = 1;
+ path->locks[upper->level] = 0;
+ slot = path->slots[upper->level];
btrfs_release_path(NULL, path);
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(upper->eb, slot);
- if (!lowest) {
- if (node->eb->start == bytenr) {
- btrfs_tree_unlock(upper->eb);
- upper->locked = 0;
- continue;
- }
+ if (lowest) {
+ BUG_ON(bytenr != node->bytenr);
} else {
- BUG_ON(node->bytenr != bytenr);
+ if (node->eb->start == bytenr)
+ goto next;
}
blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
slot, &eb);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
if (ret < 0) {
err = ret;
- break;
+ goto next;
}
- btrfs_set_lock_blocking(eb);
- node->eb = eb;
- node->locked = 1;
+ BUG_ON(node->eb != eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
BUG_ON(ret);
}
- if (!lowest) {
- btrfs_tree_unlock(upper->eb);
- upper->locked = 0;
- }
+next:
+ if (!upper->pending)
+ drop_node_buffer(upper);
+ else
+ unlock_node_buffer(upper);
+ if (err)
+ break;
}
+
+ if (!err && node->pending) {
+ drop_node_buffer(node);
+ list_move_tail(&node->list, &rc->backref_cache.changed);
+ node->pending = 0;
+ }
+
path->lowest_level = 0;
+ BUG_ON(err == -ENOSPC);
return err;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
struct backref_node *node,
struct btrfs_path *path)
{
struct btrfs_key key;
- if (!node->eb || list_empty(&node->upper))
- return 0;
btrfs_node_key_to_cpu(node->eb, &key, 0);
- return do_relocation(trans, node, &key, path, 0);
+ return do_relocation(trans, rc, node, &key, path, 0);
}
static int finish_pending_nodes(struct btrfs_trans_handle *trans,
- struct backref_cache *cache,
- struct btrfs_path *path)
+ struct reloc_control *rc,
+ struct btrfs_path *path, int err)
{
+ LIST_HEAD(list);
+ struct backref_cache *cache = &rc->backref_cache;
struct backref_node *node;
int level;
int ret;
- int err = 0;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
node = list_entry(cache->pending[level].next,
- struct backref_node, lower);
- BUG_ON(node->level != level);
+ struct backref_node, list);
+ list_move_tail(&node->list, &list);
+ BUG_ON(!node->pending);
- ret = link_to_upper(trans, node, path);
- if (ret < 0)
- err = ret;
- /*
- * this remove the node from the pending list and
- * may add some other nodes to the level + 1
- * pending list
- */
- remove_backref_node(cache, node);
+ if (!err) {
+ ret = link_to_upper(trans, rc, node, path);
+ if (ret < 0)
+ err = ret;
+ }
}
+ list_splice_init(&list, &cache->pending[level]);
}
- BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
return err;
}
static void mark_block_processed(struct reloc_control *rc,
- struct backref_node *node)
+ u64 bytenr, u32 blocksize)
+{
+ set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+ EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node)
{
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
blocksize = btrfs_level_size(rc->extent_root, node->level);
- set_extent_bits(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY,
- GFP_NOFS);
+ mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
}
@@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
if (next->processed)
break;
- mark_block_processed(rc, next);
+ __mark_block_processed(rc, next);
if (list_empty(&next->upper))
break;
@@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
return 0;
}
-/*
- * check if there are any file extent pointers in the leaf point to
- * data require processing
- */
-static int check_file_extents(struct reloc_control *rc,
- u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
- struct btrfs_key found_key;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
- u32 nritems;
- int i;
- int ret = 0;
-
- leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- cond_resched();
- btrfs_item_key_to_cpu(leaf, &found_key, i);
- if (found_key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (bytenr == 0)
- continue;
- if (in_block_group(bytenr, rc->block_group)) {
- ret = 1;
- break;
- }
- }
- free_extent_buffer(leaf);
- return ret;
-}
-
-/*
- * scan child blocks of a given block to find blocks require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct backref_node *node,
- struct rb_root *blocks)
-{
- struct tree_block *block;
- struct rb_node *rb_node;
- u64 bytenr;
- u64 ptr_gen;
- u32 blocksize;
- u32 nritems;
- int i;
- int err = 0;
-
- nritems = btrfs_header_nritems(node->eb);
- blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
- for (i = 0; i < nritems; i++) {
- cond_resched();
- bytenr = btrfs_node_blockptr(node->eb, i);
- ptr_gen = btrfs_node_ptr_generation(node->eb, i);
- if (ptr_gen == trans->transid)
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
- continue;
- if (tree_block_processed(bytenr, blocksize, rc))
- continue;
-
- readahead_tree_block(rc->extent_root,
- bytenr, blocksize, ptr_gen);
- }
-
- for (i = 0; i < nritems; i++) {
- cond_resched();
- bytenr = btrfs_node_blockptr(node->eb, i);
- ptr_gen = btrfs_node_ptr_generation(node->eb, i);
- if (ptr_gen == trans->transid)
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
- continue;
- if (tree_block_processed(bytenr, blocksize, rc))
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- !check_file_extents(rc, bytenr, blocksize, ptr_gen))
- continue;
-
- block = kmalloc(sizeof(*block), GFP_NOFS);
- if (!block) {
- err = -ENOMEM;
- break;
- }
- block->bytenr = bytenr;
- btrfs_node_key_to_cpu(node->eb, &block->key, i);
- block->level = node->level - 1;
- block->key_ready = 1;
- rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
- BUG_ON(rb_node);
- }
- if (err)
- free_block_list(blocks);
- return err;
-}
-
-/*
- * find adjacent blocks require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct backref_cache *cache,
- struct rb_root *blocks, int level,
- struct backref_node **upper)
-{
- struct backref_node *node;
- int ret = 0;
-
- WARN_ON(!list_empty(&cache->pending[level]));
-
- if (list_empty(&cache->pending[level + 1]))
- return 1;
-
- node = list_entry(cache->pending[level + 1].next,
- struct backref_node, lower);
- if (node->eb)
- ret = add_child_blocks(trans, rc, node, blocks);
-
- *upper = node;
- return ret;
-}
-
static int get_tree_block_key(struct reloc_control *rc,
struct tree_block *block)
{
@@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_root *root;
- int ret;
+ int release = 0;
+ int ret = 0;
+ if (!node)
+ return 0;
+
+ BUG_ON(node->processed);
root = select_one_root(trans, node);
- if (unlikely(!root)) {
- rc->found_old_snapshot = 1;
+ if (root == ERR_PTR(-ENOENT)) {
update_processed_blocks(rc, node);
- return 0;
+ goto out;
}
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- ret = do_relocation(trans, node, key, path, 1);
- if (ret < 0)
- goto out;
- if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
- ret = replace_file_extents(trans, rc, root,
- node->eb, NULL);
- if (ret < 0)
- goto out;
- }
- drop_node_buffer(node);
- } else if (!root->ref_cows) {
- path->lowest_level = node->level;
- ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(root, path);
- if (ret < 0)
+ if (!root || root->ref_cows) {
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
goto out;
- } else if (root != node->root) {
- WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+ release = 1;
}
- update_processed_blocks(rc, node);
- ret = 0;
+ if (root) {
+ if (root->ref_cows) {
+ BUG_ON(node->new_bytenr);
+ BUG_ON(!list_empty(&node->list));
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+ node->new_bytenr = root->node->start;
+ node->root = root;
+ list_add_tail(&node->list, &rc->backref_cache.changed);
+ } else {
+ path->lowest_level = node->level;
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ btrfs_release_path(root, path);
+ if (ret > 0)
+ ret = 0;
+ }
+ if (!ret)
+ update_processed_blocks(rc, node);
+ } else {
+ ret = do_relocation(trans, rc, node, key, path, 1);
+ }
out:
- drop_node_buffer(node);
+ if (ret || node->level == 0 || node->cowonly) {
+ if (release)
+ release_metadata_space(rc, node);
+ remove_backref_node(&rc->backref_cache, node);
+ }
return ret;
}
@@ -2415,12 +2752,10 @@ static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct rb_root *blocks)
{
- struct backref_cache *cache;
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
struct rb_node *rb_node;
- int level = -1;
int ret;
int err = 0;
@@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- cache = kmalloc(sizeof(*cache), GFP_NOFS);
- if (!cache) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
-
- backref_cache_init(cache);
-
rb_node = rb_first(blocks);
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- if (level == -1)
- level = block->level;
- else
- BUG_ON(level != block->level);
if (!block->key_ready)
reada_tree_block(rc, block);
rb_node = rb_next(rb_node);
@@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- node = build_backref_tree(rc, cache, &block->key,
+ node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
err = PTR_ERR(node);
@@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- err = ret;
+ if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ err = ret;
goto out;
}
- remove_backref_node(cache, node);
rb_node = rb_next(rb_node);
}
-
- if (level > 0)
- goto out;
-
+out:
free_block_list(blocks);
+ err = finish_pending_nodes(trans, rc, path, err);
- /*
- * now backrefs of some upper level tree blocks have been cached,
- * try relocating blocks referenced by these upper level blocks.
- */
- while (1) {
- struct backref_node *upper = NULL;
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing)
- break;
+ btrfs_free_path(path);
+ return err;
+}
- ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
- &upper);
- if (ret < 0)
- err = ret;
- if (ret != 0)
- break;
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 alloc_hint = 0;
+ u64 start;
+ u64 end;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 num_bytes;
+ int nr = 0;
+ int ret = 0;
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing)
- goto out;
- BUG_ON(!block->key_ready);
- node = build_backref_tree(rc, cache, &block->key,
- level, block->bytenr);
- if (IS_ERR(node)) {
- err = PTR_ERR(node);
- goto out;
- }
+ BUG_ON(cluster->start != cluster->boundary[0]);
+ mutex_lock(&inode->i_mutex);
- ret = relocate_tree_block(trans, rc, node,
- &block->key, path);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- remove_backref_node(cache, node);
- rb_node = rb_next(rb_node);
- }
- free_block_list(blocks);
+ ret = btrfs_check_data_free_space(inode, cluster->end +
+ 1 - cluster->start);
+ if (ret)
+ goto out;
- if (upper) {
- ret = link_to_upper(trans, upper, path);
- if (ret < 0) {
- err = ret;
- break;
- }
- remove_backref_node(cache, upper);
- }
+ while (nr < cluster->nr) {
+ start = cluster->boundary[nr] - offset;
+ if (nr + 1 < cluster->nr)
+ end = cluster->boundary[nr + 1] - 1 - offset;
+ else
+ end = cluster->end - offset;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ num_bytes = end + 1 - start;
+ ret = btrfs_prealloc_file_range(inode, 0, start,
+ num_bytes, num_bytes,
+ end + 1, &alloc_hint);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ if (ret)
+ break;
+ nr++;
}
+ btrfs_free_reserved_data_space(inode, cluster->end +
+ 1 - cluster->start);
out:
- free_block_list(blocks);
-
- ret = finish_pending_nodes(trans, cache, path);
- if (ret < 0)
- err = ret;
-
- kfree(cache);
- btrfs_free_path(path);
- return err;
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
static noinline_for_stack
@@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
- unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
int nr = 0;
@@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
- last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+ ret = prealloc_file_extent_cluster(inode, cluster);
+ if (ret)
+ goto out;
- mutex_lock(&inode->i_mutex);
+ file_ra_state_init(ra, inode->i_mapping);
- i_size_write(inode, cluster->end + 1 - offset);
ret = setup_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
- goto out_unlock;
-
- file_ra_state_init(ra, inode->i_mapping);
+ goto out;
- WARN_ON(cluster->start != cluster->boundary[0]);
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
+ ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
page = find_lock_page(inode->i_mapping, index);
if (!page) {
page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
last_index + 1 - index);
page = grab_cache_page(inode->i_mapping, index);
if (!page) {
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
- goto out_unlock;
+ goto out;
}
}
@@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
ret = -EIO;
- goto out_unlock;
+ goto out;
}
}
@@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
EXTENT_BOUNDARY, GFP_NOFS);
nr++;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
set_page_dirty(page);
- dirty_page++;
unlock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end, GFP_NOFS);
@@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_cache_release(page);
index++;
- if (nr < cluster->nr &&
- page_end + 1 + offset == cluster->boundary[nr]) {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_page);
- dirty_page = 0;
- }
- }
- if (dirty_page) {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_page);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(BTRFS_I(inode)->root);
}
WARN_ON(nr != cluster->nr);
-out_unlock:
- mutex_unlock(&inode->i_mutex);
+out:
kfree(ra);
return ret;
}
@@ -2870,9 +3172,6 @@ out:
static int block_use_full_backref(struct reloc_control *rc,
struct extent_buffer *eb)
{
- struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct btrfs_key key;
u64 flags;
int ret;
@@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
return 1;
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- key.objectid = eb->start;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = eb->len;
-
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ret = btrfs_search_slot(NULL, rc->extent_root,
- &key, path, 0, 0);
+ ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+ eb->start, eb->len, NULL, &flags);
BUG_ON(ret);
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(path->nodes[0], ei);
- BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
ret = 1;
else
ret = 0;
- btrfs_free_path(path);
return ret;
}
@@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize;
+ u32 blocksize = btrfs_level_size(rc->extent_root, 0);
int ret;
int err = 0;
- ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
- extent_key->offset);
- BUG_ON(ret < 0);
- if (ret > 0) {
- /* the relocated data is fragmented */
- rc->extents_skipped++;
- btrfs_release_path(rc->extent_root, path);
- return 0;
- }
-
- blocksize = btrfs_level_size(rc->extent_root, 0);
-
eb = path->nodes[0];
ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
*/
static noinline_for_stack
int find_next_extent(struct btrfs_trans_handle *trans,
- struct reloc_control *rc, struct btrfs_path *path)
+ struct reloc_control *rc, struct btrfs_path *path,
+ struct btrfs_key *extent_key)
{
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -3225,6 +3499,7 @@ next:
rc->search_start = end + 1;
} else {
rc->search_start = key.objectid + key.offset;
+ memcpy(extent_key, &key, sizeof(key));
return 0;
}
}
@@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags)
return 0;
}
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+ if (!rc->block_rsv)
+ return -ENOMEM;
+
+ /*
+ * reserve some space for creating reloc trees.
+ * btrfs_init_reloc_root will use them when there
+ * is no reservation in transaction handle.
+ */
+ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+ rc->extent_root->nodesize * 256,
+ &rc->block_rsv_retries);
+ if (ret)
+ return ret;
+
+ rc->block_rsv->refill_used = 1;
+ btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+
+ memset(&rc->cluster, 0, sizeof(rc->cluster));
+ rc->search_start = rc->block_group->key.objectid;
+ rc->extents_found = 0;
+ rc->nodes_relocated = 0;
+ rc->merging_rsv_size = 0;
+ rc->block_rsv_retries = 0;
+
+ rc->create_reloc_tree = 1;
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+ return 0;
+}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
- struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
- cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
- if (!cluster)
- return -ENOMEM;
-
path = btrfs_alloc_path();
- if (!path) {
- kfree(cluster);
+ if (!path)
return -ENOMEM;
- }
-
- rc->extents_found = 0;
- rc->extents_skipped = 0;
-
- rc->search_start = rc->block_group->key.objectid;
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
- GFP_NOFS);
-
- rc->create_reloc_root = 1;
- set_reloc_control(rc);
- trans = btrfs_start_transaction(rc->extent_root, 1);
- btrfs_commit_transaction(trans, rc->extent_root);
+ ret = prepare_to_relocate(rc);
+ if (ret) {
+ err = ret;
+ goto out_free;
+ }
while (1) {
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_start_transaction(rc->extent_root, 0);
+
+ if (update_backref_cache(trans, &rc->backref_cache)) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ continue;
+ }
- ret = find_next_extent(trans, rc, path);
+ ret = find_next_extent(trans, rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
@@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- item_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
if (item_size >= sizeof(*ei)) {
flags = btrfs_extent_flags(path->nodes[0], ei);
ret = check_extent_flags(flags);
@@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
- (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
ret = add_data_references(rc, &key, path, &blocks);
} else {
btrfs_release_path(rc->extent_root, path);
ret = 0;
}
if (ret < 0) {
- err = 0;
+ err = ret;
break;
}
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
+ if (ret != -EAGAIN) {
+ err = ret;
+ break;
+ }
+ rc->extents_found--;
+ rc->search_start = key.objectid;
+ }
+ }
+
+ ret = btrfs_block_rsv_check(trans, rc->extent_root,
+ rc->block_rsv, 0, 5);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
err = ret;
+ WARN_ON(1);
break;
}
+ rc->commit_transaction = 1;
}
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, rc->extent_root);
+ if (rc->commit_transaction) {
+ rc->commit_transaction = 0;
+ ret = btrfs_commit_transaction(trans, rc->extent_root);
+ BUG_ON(ret);
+ } else {
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+ }
trans = NULL;
- btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
ret = relocate_data_extent(rc->data_inode,
- &key, cluster);
+ &key, &rc->cluster);
if (ret < 0) {
err = ret;
break;
}
}
}
- btrfs_free_path(path);
+
+ btrfs_release_path(rc->extent_root, path);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+ GFP_NOFS);
if (trans) {
nr = trans->blocks_used;
- btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
if (!err) {
- ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ ret = relocate_file_extent_cluster(rc->data_inode,
+ &rc->cluster);
if (ret < 0)
err = ret;
}
- kfree(cluster);
+ rc->create_reloc_tree = 0;
+ set_reloc_control(rc);
- rc->create_reloc_root = 0;
- smp_mb();
+ backref_cache_cleanup(&rc->backref_cache);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
- if (rc->extents_found > 0) {
- trans = btrfs_start_transaction(rc->extent_root, 1);
- btrfs_commit_transaction(trans, rc->extent_root);
- }
+ err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
+ rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
/* get rid of pinned extents */
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
-
+out_free:
+ btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+ btrfs_free_path(path);
return err;
}
@@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(root, path);
out:
@@ -3460,8 +3790,9 @@ out:
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
-static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (IS_ERR(root))
return ERR_CAST(root);
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return ERR_CAST(trans);
err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
if (err)
@@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
out:
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
-
btrfs_btree_balance_dirty(root, nr);
if (err) {
if (inode)
@@ -3506,6 +3837,21 @@ out:
return inode;
}
+static struct reloc_control *alloc_reloc_control(void)
+{
+ struct reloc_control *rc;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return NULL;
+
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ backref_cache_init(&rc->backref_cache);
+ mapping_tree_init(&rc->reloc_root_tree);
+ extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+ return rc;
+}
+
/*
* function to relocate all extents in a block group.
*/
@@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
struct btrfs_fs_info *fs_info = extent_root->fs_info;
struct reloc_control *rc;
int ret;
+ int rw = 0;
int err = 0;
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = alloc_reloc_control();
if (!rc)
return -ENOMEM;
- mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
- INIT_LIST_HEAD(&rc->reloc_roots);
+ rc->extent_root = extent_root;
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- btrfs_init_workers(&rc->workers, "relocate",
- fs_info->thread_pool_size, NULL);
-
- rc->extent_root = extent_root;
- btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+ if (!rc->block_group->ro) {
+ ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ rw = 1;
+ }
rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
while (1) {
- rc->extents_found = 0;
- rc->extents_skipped = 0;
-
mutex_lock(&fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
- break;
+ goto out;
}
if (rc->extents_found == 0)
@@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
- } else if (rc->stage == UPDATE_DATA_PTRS &&
- rc->extents_skipped >= rc->extents_found) {
- iput(rc->data_inode);
- rc->data_inode = create_reloc_inode(fs_info,
- rc->block_group);
- if (IS_ERR(rc->data_inode)) {
- err = PTR_ERR(rc->data_inode);
- rc->data_inode = NULL;
- break;
- }
- rc->stage = MOVE_DATA_EXTENTS;
- rc->found_file_extent = 0;
}
}
@@ -3597,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
+ if (err && rw)
+ btrfs_set_block_group_rw(extent_root, rc->block_group);
iput(rc->data_inode);
- btrfs_stop_workers(&rc->workers);
btrfs_put_block_group(rc->block_group);
kfree(rc);
return err;
@@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
@@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = alloc_reloc_control();
if (!rc) {
err = -ENOMEM;
goto out;
}
- mapping_tree_init(&rc->reloc_root_tree);
- INIT_LIST_HEAD(&rc->reloc_roots);
- btrfs_init_workers(&rc->workers, "relocate",
- root->fs_info->thread_pool_size, NULL);
rc->extent_root = root->fs_info->extent_root;
set_reloc_control(rc);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+
+ rc->merge_reloc_tree = 1;
+
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
@@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root->reloc_root = reloc_root;
}
- trans = btrfs_start_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
merge_reloc_roots(rc);
unset_reloc_control(rc);
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
out:
- if (rc) {
- btrfs_stop_workers(&rc->workers);
- kfree(rc);
- }
+ kfree(rc);
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
@@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
btrfs_put_ordered_extent(ordered);
return 0;
}
+
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow)
+{
+ struct reloc_control *rc;
+ struct backref_node *node;
+ int first_cow = 0;
+ int level;
+ int ret;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc)
+ return;
+
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+ root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+
+ level = btrfs_header_level(buf);
+ if (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ first_cow = 1;
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ rc->create_reloc_tree) {
+ WARN_ON(!first_cow && level == 0);
+
+ node = rc->backref_cache.path[level];
+ BUG_ON(node->bytenr != buf->start &&
+ node->new_bytenr != buf->start);
+
+ drop_node_buffer(node);
+ extent_buffer_get(cow);
+ node->eb = cow;
+ node->new_bytenr = cow->start;
+
+ if (!node->pending) {
+ list_move_tail(&node->list,
+ &rc->backref_cache.pending[level]);
+ node->pending = 1;
+ }
+
+ if (first_cow)
+ __mark_block_processed(rc, node);
+
+ if (first_cow && level > 0)
+ rc->nodes_relocated += buf->len;
+ }
+
+ if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+ ret = replace_file_extents(trans, rc, root, cow);
+ BUG_ON(ret);
+ }
+}
+
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * requried for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct reloc_control *rc;
+
+ root = pending->root;
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc->merge_reloc_tree)
+ return;
+
+ root = root->reloc_root;
+ BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+ /*
+ * relocation is in the stage of merging trees. the space
+ * used by merging a reloc tree is twice the size of
+ * relocated tree nodes in the worst case. half for cowing
+ * the reloc tree, half for cowing the fs tree. the space
+ * used by cowing the reloc tree will be freed after the
+ * tree is dropped. if we create snapshot, cowing the fs
+ * tree may use more space than it frees. so we need
+ * reserve extra space.
+ */
+ *bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *new_root;
+ struct reloc_control *rc;
+ int ret;
+
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ rc->merging_rsv_size += rc->nodes_relocated;
+
+ if (rc->merge_reloc_tree) {
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ rc->block_rsv,
+ rc->nodes_relocated);
+ BUG_ON(ret);
+ }
+
+ new_root = pending->snap;
+ reloc_root = create_reloc_root(trans, root->reloc_root,
+ new_root->root_key.objectid);
+
+ __add_reloc_root(reloc_root);
+ new_root->reloc_root = reloc_root;
+
+ if (rc->create_reloc_tree) {
+ ret = clone_backref_node(trans, rc, root, reloc_root);
+ BUG_ON(ret);
+ }
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d66..b91ccd97264 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
+ struct btrfs_key root_key;
+ struct btrfs_root *root;
int err = 0;
int ret;
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+
while (1) {
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_find_dead_roots(tree_root, key.offset);
- if (ret) {
+ root_key.objectid = key.offset;
+ key.offset++;
+
+ root = btrfs_read_fs_root_no_name(tree_root->fs_info,
+ &root_key);
+ if (!IS_ERR(root))
+ continue;
+
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT) {
err = ret;
break;
}
- key.offset++;
+ ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2909a03e523..d34b2dfc962 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
btrfs_start_delalloc_inodes(root, 0);
btrfs_wait_ordered_extents(root, 0, 0);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_commit_transaction(trans, root);
return ret;
}
@@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
return -EINVAL;
- /* recover relocation */
- ret = btrfs_recover_relocation(root);
+ ret = btrfs_cleanup_fs_roots(root->fs_info);
WARN_ON(ret);
- ret = btrfs_cleanup_fs_roots(root->fs_info);
+ /* recover relocation */
+ ret = btrfs_recover_relocation(root);
WARN_ON(ret);
sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
- u64 data_used = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
- BTRFS_BLOCK_GROUP_RAID10|
- BTRFS_BLOCK_GROUP_RAID1)) {
- total_used += found->bytes_used;
- if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used += found->bytes_used;
- else
- data_used += found->total_bytes;
- }
-
- total_used += found->bytes_used;
- if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used += found->bytes_used;
- else
- data_used += found->total_bytes;
- }
+ list_for_each_entry_rcu(found, head, list)
+ total_used += found->disk_used;
rcu_read_unlock();
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_blocks - (data_used >> bits);
+ buf->f_bavail = buf->f_bfree;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b9..66e4c66cc63 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
TRANS_USERSPACE,
};
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+ if (!root->fs_info->log_root_recovering &&
+ ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+ type == TRANS_USERSPACE))
+ return 1;
+ return 0;
+}
+
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- int num_blocks, int type)
+ u64 num_items, int type)
{
- struct btrfs_trans_handle *h =
- kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ struct btrfs_trans_handle *h;
+ struct btrfs_transaction *cur_trans;
+ int retries = 0;
int ret;
+again:
+ h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ if (!h)
+ return ERR_PTR(-ENOMEM);
mutex_lock(&root->fs_info->trans_mutex);
- if (!root->fs_info->log_root_recovering &&
- ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
- type == TRANS_USERSPACE))
+ if (may_wait_transaction(root, type))
wait_current_trans(root);
+
ret = join_transaction(root);
BUG_ON(ret);
- h->transid = root->fs_info->running_transaction->transid;
- h->transaction = root->fs_info->running_transaction;
- h->blocks_reserved = num_blocks;
+ cur_trans = root->fs_info->running_transaction;
+ cur_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ h->transid = cur_trans->transid;
+ h->transaction = cur_trans;
h->blocks_used = 0;
h->block_group = 0;
- h->alloc_exclude_nr = 0;
- h->alloc_exclude_start = 0;
+ h->bytes_reserved = 0;
h->delayed_ref_updates = 0;
+ h->block_rsv = NULL;
- if (!current->journal_info && type != TRANS_USERSPACE)
- current->journal_info = h;
+ smp_mb();
+ if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+
+ if (num_items > 0) {
+ ret = btrfs_trans_reserve_metadata(h, root, num_items,
+ &retries);
+ if (ret == -EAGAIN) {
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+ if (ret < 0) {
+ btrfs_end_transaction(h, root);
+ return ERR_PTR(ret);
+ }
+ }
- root->fs_info->running_transaction->use_count++;
+ mutex_lock(&root->fs_info->trans_mutex);
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (!current->journal_info && type != TRANS_USERSPACE)
+ current->journal_info = h;
return h;
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_blocks)
+ int num_items)
{
- return start_transaction(root, num_blocks, TRANS_START);
+ return start_transaction(root, num_items, TRANS_START);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, TRANS_JOIN);
+ return start_transaction(root, 0, TRANS_JOIN);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks)
{
- return start_transaction(r, num_blocks, TRANS_USERSPACE);
+ return start_transaction(r, 0, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
mutex_unlock(&root->fs_info->trans_mutex);
}
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ ret = btrfs_block_rsv_check(trans, root,
+ &root->fs_info->global_block_rsv, 0, 5);
+ return ret ? 1 : 0;
+}
+
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int updates;
+
+ if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ return 1;
+
+ updates = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (updates)
+ btrfs_run_delayed_refs(trans, root, updates);
+
+ return should_end_transaction(trans, root);
+}
+
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int throttle)
{
- struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
int count = 0;
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
count++;
}
+ btrfs_trans_release_metadata(trans, root);
+
+ if (!root->fs_info->open_ioctl_trans &&
+ should_end_transaction(trans, root))
+ trans->transaction->blocked = 1;
+
+ if (cur_trans->blocked && !cur_trans->in_commit) {
+ if (throttle)
+ return btrfs_commit_transaction(trans, root);
+ else
+ wake_up_process(info->transaction_kthread);
+ }
+
mutex_lock(&info->trans_mutex);
- cur_trans = info->running_transaction;
- WARN_ON(cur_trans != trans->transaction);
+ WARN_ON(cur_trans != info->running_transaction);
WARN_ON(cur_trans->num_writers < 1);
cur_trans->num_writers--;
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
+ btrfs_orphan_commit_root(trans, root);
if (root->commit_root != root->node) {
switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
struct btrfs_fs_info *info = root->fs_info;
- int ret;
struct btrfs_trans_handle *trans;
+ int ret;
unsigned long nr;
- smp_mb();
- if (root->defrag_running)
+ if (xchg(&root->defrag_running, 1))
return 0;
- trans = btrfs_start_transaction(root, 1);
+
while (1) {
- root->defrag_running = 1;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
ret = btrfs_defrag_leaves(trans, root, cacheonly);
+
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(info->tree_root, nr);
cond_resched();
- trans = btrfs_start_transaction(root, 1);
if (root->fs_info->closing || ret != -EAGAIN)
break;
}
root->defrag_running = 0;
- smp_mb();
- btrfs_end_transaction(trans, root);
- return 0;
+ return ret;
}
#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct inode *parent_inode;
+ struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
int ret;
- u64 objectid;
- int namelen;
+ int retries = 0;
+ u64 to_reserve = 0;
u64 index = 0;
-
- parent_inode = pending->dentry->d_parent->d_inode;
- parent_root = BTRFS_I(parent_inode)->root;
+ u64 objectid;
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
- ret = -ENOMEM;
+ pending->error = -ENOMEM;
goto fail;
}
+
ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
- if (ret)
+ if (ret) {
+ pending->error = ret;
goto fail;
+ }
+
+ btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+ btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
+
+ if (to_reserve > 0) {
+ ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+ to_reserve, &retries);
+ if (ret) {
+ pending->error = ret;
+ goto fail;
+ }
+ }
key.objectid = objectid;
- /* record when the snapshot was created in key.offset */
- key.offset = trans->transid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
- memcpy(&pending->root_key, &key, sizeof(key));
- pending->root_key.offset = (u64)-1;
+ trans->block_rsv = &pending->block_rsv;
+ dentry = pending->dentry;
+ parent_inode = dentry->d_parent->d_inode;
+ parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
+
/*
* insert the directory item
*/
- namelen = strlen(pending->name);
ret = btrfs_set_inode_index(parent_inode, &index);
BUG_ON(ret);
ret = btrfs_insert_dir_item(trans, parent_root,
- pending->name, namelen,
- parent_inode->i_ino,
- &pending->root_key, BTRFS_FT_DIR, index);
+ dentry->d_name.name, dentry->d_name.len,
+ parent_inode->i_ino, &key,
+ BTRFS_FT_DIR, index);
BUG_ON(ret);
- btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ dentry->d_name.len * 2);
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
btrfs_set_root_node(new_root_item, tmp);
- ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- new_root_item);
- BUG_ON(ret);
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
+ ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
+ BUG_ON(ret);
- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
- pending->root_key.objectid,
+ /*
+ * insert root back/forward references
+ */
+ ret = btrfs_add_root_ref(trans, tree_root, objectid,
parent_root->root_key.objectid,
- parent_inode->i_ino, index, pending->name,
- namelen);
+ parent_inode->i_ino, index,
+ dentry->d_name.name, dentry->d_name.len);
BUG_ON(ret);
+ key.offset = (u64)-1;
+ pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(pending->snap));
+
+ btrfs_reloc_post_snapshot(trans, pending);
+ btrfs_orphan_post_snapshot(trans, pending);
fail:
kfree(new_root_item);
- return ret;
+ btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
+ return 0;
}
/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
return ret;
}
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+ int ret = 0;
+ spin_lock(&info->new_trans_lock);
+ if (info->running_transaction)
+ ret = info->running_transaction->blocked;
+ spin_unlock(&info->new_trans_lock);
+ return ret;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
+ btrfs_trans_release_metadata(trans, root);
+
cur_trans = trans->transaction;
/*
* set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
if (cur_trans->num_writers > 1)
timeout = MAX_SCHEDULE_TIMEOUT;
else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_run_ordered_operations(root, 1);
+ prepare_to_wait(&cur_trans->writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
smp_mb();
if (cur_trans->num_writers > 1 || should_grow)
schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, 0);
+ btrfs_drop_snapshot(root, NULL, 0);
else
- btrfs_drop_snapshot(root, 1);
+ btrfs_drop_snapshot(root, NULL, 1);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb3311..e104986d0bf 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
+ u64 block_group;
+ u64 bytes_reserved;
unsigned long blocks_reserved;
unsigned long blocks_used;
- struct btrfs_transaction *transaction;
- u64 block_group;
- u64 alloc_exclude_start;
- u64 alloc_exclude_nr;
unsigned long delayed_ref_updates;
+ struct btrfs_transaction *transaction;
+ struct btrfs_block_rsv *block_rsv;
};
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct btrfs_root *root;
- char *name;
- struct btrfs_key root_key;
+ struct btrfs_root *snap;
+ /* block reservation for the operation */
+ struct btrfs_block_rsv block_rsv;
+ /* extra metadata reseration for relocation */
+ int error;
struct list_head list;
};
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_blocks);
+ int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks);
+ int num_blocks);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks);
+ int num_blocks);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb162..f7ac8e013ed 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->nodes[1], 0,
cache_only, &last_ret,
&root->defrag_progress);
- WARN_ON(ret && ret != -EAGAIN);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
}
-
- btrfs_release_path(root, path);
out:
if (path)
btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d..fb102a9aee9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
+ int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->tree_log_mutex);
if (!root->fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
- if (!root->log_root) {
+ if (err == 0 && !root->log_root) {
ret = btrfs_add_log_tree(trans, root);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
mutex_unlock(&root->fs_info->tree_log_mutex);
root->log_batch++;
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return 0;
+ return err;
}
/*
@@ -376,7 +379,7 @@ insert:
BUG_ON(ret);
}
} else if (ret) {
- BUG();
+ return ret;
}
dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
- wc->process_func(root, next, wc, ptr_gen);
-
if (*level == 1) {
+ wc->process_func(root, next, wc, ptr_gen);
+
path->slots[*level]++;
if (wc->free) {
btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- bytenr = path->nodes[*level]->start;
-
- blocksize = btrfs_level_size(root, *level);
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
-
- wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
-
- if (wc->free) {
- next = path->nodes[*level];
- btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
- btrfs_set_lock_blocking(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
-
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
- BUG_ON(ret);
- }
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
- *level += 1;
+ path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
slot = path->slots[i];
- if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+ if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
struct extent_buffer *node;
node = path->nodes[i];
path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
- BUG_ON(ret);
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
wake_up(&log_root_tree->log_writer_wait);
}
+ if (ret) {
+ BUG_ON(ret != -ENOSPC);
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out;
+ }
+
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
return 0;
}
-/*
- * free all the extents used by the tree log. This should be called
- * at commit time of the full transaction
- */
-int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+static void free_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
{
int ret;
- struct btrfs_root *log;
- struct key;
u64 start;
u64 end;
struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
.process_func = process_one_buffer
};
- if (!root->log_root || root->fs_info->log_root_recovering)
- return 0;
-
- log = root->log_root;
ret = walk_log_tree(trans, log, &wc);
BUG_ON(ret);
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
- if (log->log_transid > 0) {
- ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
- &log->root_key);
- BUG_ON(ret);
- }
- root->log_root = NULL;
free_extent_buffer(log->node);
kfree(log);
+}
+
+/*
+ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ if (root->log_root) {
+ free_log_tree(trans, root->log_root);
+ root->log_root = NULL;
+ }
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->log_root_tree) {
+ free_log_tree(trans, fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ }
return 0;
}
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
+ int err = 0;
int bytes_del = 0;
if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
name, name_len, -1);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
index, name, name_len, -1);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
if (ret == 0) {
struct btrfs_inode_item *item;
u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
ret = 0;
btrfs_release_path(log, path);
}
-
+fail:
btrfs_free_path(path);
mutex_unlock(&BTRFS_I(dir)->log_mutex);
+ if (ret == -ENOSPC) {
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 0;
+ }
btrfs_end_log_trans(root);
return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ if (ret == -ENOSPC) {
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 0;
+ }
btrfs_end_log_trans(root);
return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
else
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- BUG_ON(ret);
+ if (ret)
+ return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
+ int err = 0;
int ret;
int i;
int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
}
}
btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
goto done;
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
}
path->slots[0] = nritems;
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
-
- BUG_ON(ret);
- last_offset = tmp.offset;
+ if (ret)
+ err = ret;
+ else
+ last_offset = tmp.offset;
goto done;
}
}
done:
- *last_offset_ret = last_offset;
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
- /* insert the log range keys to indicate where the log is valid */
- ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
- first_offset, last_offset);
- BUG_ON(ret);
- return 0;
+ if (err == 0) {
+ *last_offset_ret = last_offset;
+ /*
+ * insert the log range keys to indicate where the log
+ * is valid
+ */
+ ret = insert_dir_log_key(trans, log, path, key_type,
+ inode->i_ino, first_offset,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
+ return err;
}
/*
@@ -2501,7 +2529,8 @@ again:
ret = log_dir_items(trans, root, inode, path,
dst_path, key_type, min_key,
&max_key);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (max_key == (u64)-1)
break;
min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-
- if (ret != 1)
+ BUG_ON(ret == 0);
+ if (ret < 0)
break;
if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
}
btrfs_release_path(log, path);
- return 0;
+ return ret;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
}
ret = btrfs_insert_empty_items(trans, log, dst_path,
ins_keys, ins_sizes, nr);
- BUG_ON(ret);
+ if (ret) {
+ kfree(ins_data);
+ return ret;
+ }
for (i = 0; i < nr; i++, dst_path->slots[0]++) {
dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
+ ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
- ret = btrfs_csum_file_blocks(trans, log, sums);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
- return 0;
+ return ret;
}
/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
u32 size;
+ int err = 0;
int ret;
int nritems;
int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
}
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
path->keep_locks = 1;
while (1) {
@@ -2768,7 +2805,10 @@ again:
ret = copy_items(trans, log, dst_path, src, ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
ret = copy_items(trans, log, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 0;
}
btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
ret = copy_items(trans, log, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 0;
}
WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
ret = log_directory_changes(trans, root, inode, path, dst_path);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
}
BTRFS_I(inode)->logged_trans = trans->transid;
+out_unlock:
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return 0;
+ return err;
}
/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- start_log_trans(trans, root);
+ ret = start_log_trans(trans, root);
+ if (ret)
+ goto end_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
- BUG_ON(ret);
+ if (ret)
+ goto end_trans;
/*
* for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
*/
if (S_ISREG(inode->i_mode) &&
BTRFS_I(inode)->generation <= last_committed &&
- BTRFS_I(inode)->last_unlink_trans <= last_committed)
- goto no_parent;
+ BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+ ret = 0;
+ goto end_trans;
+ }
inode_only = LOG_INODE_EXISTS;
while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
- BUG_ON(ret);
+ if (ret)
+ goto end_trans;
}
if (IS_ROOT(parent))
break;
parent = parent->d_parent;
}
-no_parent:
ret = 0;
+end_trans:
+ if (ret < 0) {
+ BUG_ON(ret != -ENOSPC);
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 1;
+ }
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
path = btrfs_alloc_path();
BUG_ON(!path);
- trans = btrfs_start_transaction(fs_info->tree_root, 1);
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
wc.trans = trans;
wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb508..3dfae84c8cc 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae..d6e3af8be95 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- BUG_ON(ret);
+ if (ret)
+ return ret;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
BUG_ON(!trans);
lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
break;
BUG_ON(ret);
- trans = btrfs_start_transaction(dev_root, 1);
+ trans = btrfs_start_transaction(dev_root, 0);
BUG_ON(!trans);
ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
}
/* Shrinking succeeded, else we would be at "done". */
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto done;
- }
+ trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288..88ecbb21587 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (trans)
return do_setxattr(trans, inode, name, value, size, flags);
- ret = btrfs_reserve_metadata_space(root, 2);
- if (ret)
- return ret;
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto out;
- }
btrfs_set_trans_block_group(trans, inode);
ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
BUG_ON(ret);
out:
btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 2);
return ret;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb..da111aacb46 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
int reap_counter; /* rate limit reaping */
get_block_t *get_block; /* block mapping function */
dio_iodone_t *end_io; /* IO completion function */
+ dio_submit_t *submit_io; /* IO submition function */
+ loff_t logical_offset_in_bio; /* current first logical block in bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
unsigned cur_page_offset; /* Offset into it, in bytes */
unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
sector_t cur_page_block; /* Where it starts */
+ loff_t cur_page_fs_offset; /* Offset in file */
/* BIO completion state */
spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ * @error: Error if there was one
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio, int error)
+{
+ struct dio *dio = bio->bi_private;
+
+ if (dio->is_async)
+ dio_bio_end_aio(bio, error);
+ else
+ dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
bio->bi_end_io = dio_bio_end_io;
dio->bio = bio;
+ dio->logical_offset_in_bio = dio->cur_page_fs_offset;
return 0;
}
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);
- submit_bio(dio->rw, bio);
+ if (dio->submit_io)
+ dio->submit_io(dio->rw, bio, dio->inode,
+ dio->logical_offset_in_bio);
+ else
+ submit_bio(dio->rw, bio);
dio->bio = NULL;
dio->boundary = 0;
+ dio->logical_offset_in_bio = 0;
}
/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
int ret = 0;
if (dio->bio) {
+ loff_t cur_offset = dio->block_in_file << dio->blkbits;
+ loff_t bio_next_offset = dio->logical_offset_in_bio +
+ dio->bio->bi_size;
+
/*
- * See whether this new request is contiguous with the old
+ * See whether this new request is contiguous with the old.
+ *
+ * Btrfs cannot handl having logically non-contiguous requests
+ * submitted. For exmple if you have
+ *
+ * Logical: [0-4095][HOLE][8192-12287]
+ * Phyiscal: [0-4095] [4096-8181]
+ *
+ * We cannot submit those pages together as one BIO. So if our
+ * current logical offset in the file does not equal what would
+ * be the next logical offset in the bio, submit the bio we
+ * have.
*/
- if (dio->final_block_in_bio != dio->cur_page_block)
+ if (dio->final_block_in_bio != dio->cur_page_block ||
+ cur_offset != bio_next_offset)
dio_bio_submit(dio);
/*
* Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
dio->cur_page_offset = offset;
dio->cur_page_len = len;
dio->cur_page_block = blocknr;
+ dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
out:
return ret;
}
@@ -935,7 +981,7 @@ static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
- struct dio *dio)
+ dio_submit_t submit_io, struct dio *dio)
{
unsigned long user_addr;
unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->get_block = get_block;
dio->end_io = end_io;
+ dio->submit_io = submit_io;
dio->final_block_in_bio = -1;
dio->next_block_for_io = -1;
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
}
} /* end iovec loop */
- if (ret == -ENOTBLK && (rw & WRITE)) {
+ if (ret == -ENOTBLK) {
/*
* The remaining part of the request will be
* be handled by buffered I/O when we return
@@ -1110,7 +1157,7 @@ ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int flags)
+ dio_submit_t submit_io, int flags)
{
int seg;
size_t size;
@@ -1197,7 +1244,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
(end > i_size_read(inode)));
retval = direct_io_worker(rw, iocb, inode, iov, offset,
- nr_segs, blkbits, get_block, end_io, dio);
+ nr_segs, blkbits, get_block, end_io,
+ submit_io, dio);
/*
* In case of error extending write may have instantiated a few
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9682d52d150..85e823adcd4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2251,10 +2251,15 @@ static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
#endif
#ifdef CONFIG_BLOCK
+struct bio;
+typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
+ loff_t file_offset);
+void dio_end_io(struct bio *bio, int error);
+
ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int lock_type);
+ dio_submit_t submit_io, int lock_type);
enum {
/* need locking between buffered and direct access */
@@ -2270,7 +2275,7 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io,
+ nr_segs, get_block, end_io, NULL,
DIO_LOCKING | DIO_SKIP_HOLES);
}
@@ -2280,7 +2285,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
dio_iodone_t end_io)
{
return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io, 0);
+ nr_segs, get_block, end_io, NULL, 0);
}
#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 35e12d18656..45a2d18df84 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1275,7 +1275,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
- unsigned long seg;
+ unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
@@ -1302,21 +1302,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
}
- if (retval > 0)
+ if (retval > 0) {
*ppos = pos + retval;
- if (retval) {
+ count -= retval;
+ }
+
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
file_accessed(filp);
goto out;
}
}
}
+ count = retval;
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
+ loff_t offset = 0;
+
+ /*
+ * If we did a short DIO read we need to skip the section of the
+ * iov that we've already read data into.
+ */
+ if (count) {
+ if (count > iov[seg].iov_len) {
+ count -= iov[seg].iov_len;
+ continue;
+ }
+ offset = count;
+ count = 0;
+ }
desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
+ desc.arg.buf = iov[seg].iov_base + offset;
+ desc.count = iov[seg].iov_len - offset;
if (desc.count == 0)
continue;
desc.error = 0;