aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-30 09:05:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-30 09:05:48 -0700
commit925d169f5b86fe57e2f5264ea574cce9a89b719d (patch)
tree241d3156b427c6398bd3fc5efa9108635d0e189b
parentcdf01dd5443d0befc8c6a32cb2e3d2f568fd2558 (diff)
parent6418c96107a2b399848bb8cfc6e29f11ca74fb94 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (39 commits) Btrfs: deal with errors from updating the tree log Btrfs: allow subvol deletion by unprivileged user with -o user_subvol_rm_allowed Btrfs: make SNAP_DESTROY async Btrfs: add SNAP_CREATE_ASYNC ioctl Btrfs: add START_SYNC, WAIT_SYNC ioctls Btrfs: async transaction commit Btrfs: fix deadlock in btrfs_commit_transaction Btrfs: fix lockdep warning on clone ioctl Btrfs: fix clone ioctl where range is adjacent to extent Btrfs: fix delalloc checks in clone ioctl Btrfs: drop unused variable in block_alloc_rsv Btrfs: cleanup warnings from gcc 4.6 (nonbugs) Btrfs: Fix variables set but not read (bugs found by gcc 4.6) Btrfs: Use ERR_CAST helpers Btrfs: use memdup_user helpers Btrfs: fix raid code for removing missing drives Btrfs: Switch the extent buffer rbtree into a radix tree Btrfs: restructure try_release_extent_buffer() Btrfs: use the flusher threads for delalloc throttling Btrfs: tune the chunk allocation to 5% of the FS as metadata ... Fix up trivial conflicts in fs/btrfs/super.c and fs/fs-writeback.c, and remove use of INIT_RCU_HEAD in fs/btrfs/extent_io.c (that init macro was useless and removed in commit 5e8067adfdba: "rcu head remove init")
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c57
-rw-r--r--fs/btrfs/ctree.h100
-rw-r--r--fs/btrfs/dir-item.c2
-rw-r--r--fs/btrfs/disk-io.c32
-rw-r--r--fs/btrfs/extent-tree.c694
-rw-r--r--fs/btrfs/extent_io.c168
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/free-space-cache.c751
-rw-r--r--fs/btrfs/free-space-cache.h18
-rw-r--r--fs/btrfs/inode.c202
-rw-r--r--fs/btrfs/ioctl.c398
-rw-r--r--fs/btrfs/ioctl.h13
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/relocation.c109
-rw-r--r--fs/btrfs/root-tree.c2
-rw-r--r--fs/btrfs/super.c42
-rw-r--r--fs/btrfs/transaction.c234
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c17
-rw-r--r--fs/btrfs/volumes.c7
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c5
-rw-r--r--fs/fs-writeback.c47
-rw-r--r--include/linux/writeback.h2
27 files changed, 2405 insertions, 519 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a..7845d1f7d1d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -163,7 +163,6 @@ fail:
*/
static void end_compressed_bio_read(struct bio *bio, int err)
{
- struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
@@ -187,7 +186,6 @@ static void end_compressed_bio_read(struct bio *bio, int err)
/* ok, we're the last bio for this extent, lets start
* the decompression.
*/
- tree = &BTRFS_I(inode)->io_tree;
ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
cb->start,
cb->orig_bio->bi_io_vec,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc..9ac17159925 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid)
{
struct extent_buffer *cow;
- u32 nritems;
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
@@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(root->ref_cows && trans->transid != root->last_trans);
level = btrfs_header_level(buf);
- nritems = btrfs_header_nritems(buf);
if (level == 0)
btrfs_item_key(buf, &disk_key, 0);
else
@@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
int wret;
int pslot;
int orig_slot = path->slots[level];
- int err_on_enospc = 0;
u64 orig_ptr;
if (level == 0)
@@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- if (btrfs_header_nritems(mid) < 2)
- err_on_enospc = 1;
+ btrfs_header_nritems(mid);
left = read_node_slot(root, parent, pslot - 1);
if (left) {
@@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
wret = push_node_left(trans, root, left, mid, 1);
if (wret < 0)
ret = wret;
- if (btrfs_header_nritems(mid) < 2)
- err_on_enospc = 1;
+ btrfs_header_nritems(mid);
}
/*
@@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
int wret;
int pslot;
int orig_slot = path->slots[level];
- u64 orig_ptr;
if (level == 0)
return 1;
mid = path->nodes[level];
WARN_ON(btrfs_header_generation(mid) != trans->transid);
- orig_ptr = btrfs_node_blockptr(mid, orig_slot);
if (level < BTRFS_MAX_LEVEL - 1)
parent = path->nodes[level + 1];
@@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
blocksize = btrfs_level_size(root, level - 1);
tmp = btrfs_find_tree_block(root, blocknr, blocksize);
- if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
- /*
- * we found an up to date block without sleeping, return
- * right away
- */
- *eb_ret = tmp;
- return 0;
+ if (tmp) {
+ if (btrfs_buffer_uptodate(tmp, 0)) {
+ if (btrfs_buffer_uptodate(tmp, gen)) {
+ /*
+ * we found an up to date block without
+ * sleeping, return
+ * right away
+ */
+ *eb_ret = tmp;
+ return 0;
+ }
+ /* the pages were up to date, but we failed
+ * the generation number check. Do a full
+ * read for the generation number that is correct.
+ * We must do this without dropping locks so
+ * we can trust our generation number
+ */
+ free_extent_buffer(tmp);
+ tmp = read_tree_block(root, blocknr, blocksize, gen);
+ if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+ *eb_ret = tmp;
+ return 0;
+ }
+ free_extent_buffer(tmp);
+ btrfs_release_path(NULL, p);
+ return -EIO;
+ }
}
/*
@@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_unlock_up_safe(p, level + 1);
btrfs_set_path_blocking(p);
- if (tmp)
- free_extent_buffer(tmp);
+ free_extent_buffer(tmp);
if (p->reada)
reada_for_search(root, p, level, slot, key->objectid);
@@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
{
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
- int slot;
int i;
int push_space = 0;
int push_items = 0;
@@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
u32 this_item_size;
u32 old_left_item_size;
- slot = path->slots[1];
-
if (empty)
nr = min(right_nritems, max_slot);
else
@@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
{
int ret = 0;
int slot;
- int slot_orig;
struct extent_buffer *leaf;
struct btrfs_item *item;
u32 nritems;
@@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int size_diff;
int i;
- slot_orig = path->slots[0];
leaf = path->nodes[0];
slot = path->slots[0];
@@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
{
int ret = 0;
int slot;
- int slot_orig;
struct extent_buffer *leaf;
struct btrfs_item *item;
u32 nritems;
@@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
int i;
- slot_orig = path->slots[0];
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_key *cpu_key, u32 *data_size,
int nr)
{
- struct extent_buffer *leaf;
int ret = 0;
int slot;
int i;
@@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
- leaf = path->nodes[0];
slot = path->slots[0];
BUG_ON(slot < 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eaf286abad1..8db9234f6b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -99,6 +99,9 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+/* For storing free space cache */
+#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
@@ -265,6 +268,22 @@ struct btrfs_chunk {
/* additional stripes go here */
} __attribute__ ((__packed__));
+#define BTRFS_FREE_SPACE_EXTENT 1
+#define BTRFS_FREE_SPACE_BITMAP 2
+
+struct btrfs_free_space_entry {
+ __le64 offset;
+ __le64 bytes;
+ u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_free_space_header {
+ struct btrfs_disk_key location;
+ __le64 generation;
+ __le64 num_entries;
+ __le64 num_bitmaps;
+} __attribute__ ((__packed__));
+
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -365,8 +384,10 @@ struct btrfs_super_block {
char label[BTRFS_LABEL_SIZE];
+ __le64 cache_generation;
+
/* future expansion */
- __le64 reserved[32];
+ __le64 reserved[31];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
} __attribute__ ((__packed__));
@@ -375,13 +396,15 @@ struct btrfs_super_block {
* ones specified below then we will fail to mount
*/
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
-#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
+#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -675,7 +698,8 @@ struct btrfs_block_group_item {
struct btrfs_space_info {
u64 flags;
- u64 total_bytes; /* total bytes in the space */
+ u64 total_bytes; /* total bytes in the space,
+ this doesn't take mirrors into account */
u64 bytes_used; /* total bytes used,
this does't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
@@ -687,6 +711,8 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 disk_used; /* total bytes used on disk */
+ u64 disk_total; /* total bytes on disk, takes mirrors into
+ account */
int full; /* indicates that we cannot allocate any more
chunks for this space */
@@ -750,6 +776,14 @@ enum btrfs_caching_type {
BTRFS_CACHE_FINISHED = 2,
};
+enum btrfs_disk_cache_state {
+ BTRFS_DC_WRITTEN = 0,
+ BTRFS_DC_ERROR = 1,
+ BTRFS_DC_CLEAR = 2,
+ BTRFS_DC_SETUP = 3,
+ BTRFS_DC_NEED_WRITE = 4,
+};
+
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
@@ -763,6 +797,7 @@ struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
struct btrfs_fs_info *fs_info;
+ struct inode *inode;
spinlock_t lock;
u64 pinned;
u64 reserved;
@@ -773,8 +808,11 @@ struct btrfs_block_group_cache {
int extents_thresh;
int free_extents;
int total_bitmaps;
- int ro;
- int dirty;
+ int ro:1;
+ int dirty:1;
+ int iref:1;
+
+ int disk_cache_state;
/* cache tracking stuff */
int cached;
@@ -863,6 +901,7 @@ struct btrfs_fs_info {
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
wait_queue_head_t transaction_wait;
+ wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
struct btrfs_super_block super_copy;
@@ -949,6 +988,7 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_workers;
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
+ struct btrfs_workers endio_freespace_worker;
struct btrfs_workers submit_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@@ -1192,6 +1232,9 @@ struct btrfs_root {
#define BTRFS_MOUNT_NOSSD (1 << 9)
#define BTRFS_MOUNT_DISCARD (1 << 10)
#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
+#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
+#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
+#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
write_eb_member(eb, item, struct btrfs_dir_item, location, key);
}
+BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
+ num_entries, 64);
+BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
+ num_bitmaps, 64);
+BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
+ generation, 64);
+
+static inline void btrfs_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
+static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
+ struct btrfs_free_space_header *h,
+ struct btrfs_disk_key *key)
+{
+ write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
+}
+
/* struct btrfs_disk_key */
BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
objectid, 64);
@@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
incompat_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
csum_type, 16);
+BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
+ cache_generation, 64);
static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
{
@@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file)
return file->f_path.dentry;
}
+static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+{
+ return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
+}
+
/* extent-tree.c */
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- int num_items, int *retries);
+ int num_items);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int *retries);
+ u64 num_bytes);
int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
@@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+ int sync);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
@@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa4..f0cad5ae5be 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
ret = btrfs_truncate_item(trans, root, path,
item_len - sub_item_len, 1);
}
- return 0;
+ return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5e789f4a3ed..fb827d0d718 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
struct extent_io_tree *tree;
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 found_start;
- int found_level;
unsigned long len;
struct extent_buffer *eb;
int ret;
@@ -369,8 +368,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
WARN_ON(1);
goto err;
}
- found_level = btrfs_header_level(eb);
-
csum_tree_block(root, eb, 0);
err:
free_extent_buffer(eb);
@@ -481,9 +478,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
end_io_wq->work.flags = 0;
if (bio->bi_rw & REQ_WRITE) {
- if (end_io_wq->metadata)
+ if (end_io_wq->metadata == 1)
btrfs_queue_worker(&fs_info->endio_meta_write_workers,
&end_io_wq->work);
+ else if (end_io_wq->metadata == 2)
+ btrfs_queue_worker(&fs_info->endio_freespace_worker,
+ &end_io_wq->work);
else
btrfs_queue_worker(&fs_info->endio_write_workers,
&end_io_wq->work);
@@ -497,6 +497,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
}
}
+/*
+ * For the metadata arg you want
+ *
+ * 0 - if data
+ * 1 - if normal metadta
+ * 2 - if writing to the free space cache area
+ */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata)
{
@@ -533,11 +540,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
static void run_one_async_start(struct btrfs_work *work)
{
- struct btrfs_fs_info *fs_info;
struct async_submit_bio *async;
async = container_of(work, struct async_submit_bio, work);
- fs_info = BTRFS_I(async->inode)->root->fs_info;
async->submit_bio_start(async->inode, async->rw, async->bio,
async->mirror_num, async->bio_flags,
async->bio_offset);
@@ -850,12 +855,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid)
{
struct extent_buffer *buf = NULL;
- struct inode *btree_inode = root->fs_info->btree_inode;
- struct extent_io_tree *io_tree;
int ret;
- io_tree = &BTRFS_I(btree_inode)->io_tree;
-
buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
if (!buf)
return NULL;
@@ -1377,7 +1378,6 @@ static int bio_ready_for_csum(struct bio *bio)
u64 start = 0;
struct page *page;
struct extent_io_tree *io_tree = NULL;
- struct btrfs_fs_info *info = NULL;
struct bio_vec *bvec;
int i;
int ret;
@@ -1396,7 +1396,6 @@ static int bio_ready_for_csum(struct bio *bio)
buf_len = page->private >> 2;
start = page_offset(page) + bvec->bv_offset;
io_tree = &BTRFS_I(page->mapping->host)->io_tree;
- info = BTRFS_I(page->mapping->host)->root->fs_info;
}
/* are we fully contained in this bio? */
if (buf_len <= length)
@@ -1680,12 +1679,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_throttle);
init_waitqueue_head(&fs_info->transaction_wait);
+ init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
__setup_root(4096, 4096, 4096, 4096, tree_root,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
-
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh)
goto fail_iput;
@@ -1775,6 +1774,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
+ 1, &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1795,6 +1796,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
+ btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1993,6 +1995,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY)) {
down_read(&fs_info->cleanup_work_sem);
btrfs_orphan_cleanup(fs_info->fs_root);
+ btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
}
@@ -2035,6 +2038,7 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -2410,6 +2414,7 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ btrfs_put_block_group_cache(fs_info);
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
@@ -2456,6 +2461,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_workers);
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
+ btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_close_devices(fs_info->fs_devices);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0b81ecdb101..0c097f3aec4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
return NULL;
}
+ /* We're loading it the fast way, so we don't have a caching_ctl. */
+ if (!cache->caching_ctl) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
ctl = cache->caching_ctl;
atomic_inc(&ctl->count);
spin_unlock(&cache->lock);
@@ -421,7 +427,9 @@ err:
return 0;
}
-static int cache_block_group(struct btrfs_block_group_cache *cache)
+static int cache_block_group(struct btrfs_block_group_cache *cache,
+ struct btrfs_trans_handle *trans,
+ int load_cache_only)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
@@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
if (cache->cached != BTRFS_CACHE_NO)
return 0;
+ /*
+ * We can't do the read from on-disk cache during a commit since we need
+ * to have the normal tree locking.
+ */
+ if (!trans->transaction->in_commit) {
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ return 0;
+ }
+ cache->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&cache->lock);
+
+ ret = load_free_space_cache(fs_info, cache);
+
+ spin_lock(&cache->lock);
+ if (ret == 1) {
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->last_byte_to_unpin = (u64)-1;
+ } else {
+ cache->cached = BTRFS_CACHE_NO;
+ }
+ spin_unlock(&cache->lock);
+ if (ret == 1)
+ return 0;
+ }
+
+ if (load_cache_only)
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
BUG_ON(!caching_ctl);
@@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
- if (found->flags == flags) {
+ if (found->flags & flags) {
rcu_read_unlock();
return found;
}
@@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor)
return num;
}
+static u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor == 100)
+ return num;
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
+
u64 btrfs_find_block_group(struct btrfs_root *root,
u64 search_start, u64 search_hint, int owner)
{
@@ -2687,6 +2734,109 @@ next_block_group(struct btrfs_root *root,
return cache;
}
+static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = block_group->fs_info->tree_root;
+ struct inode *inode = NULL;
+ u64 alloc_hint = 0;
+ int num_pages = 0;
+ int retries = 0;
+ int ret = 0;
+
+ /*
+ * If this block group is smaller than 100 megs don't bother caching the
+ * block group.
+ */
+ if (block_group->key.offset < (100 * 1024 * 1024)) {
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+again:
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ ret = PTR_ERR(inode);
+ btrfs_release_path(root, path);
+ goto out;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retries);
+ retries++;
+
+ if (block_group->ro)
+ goto out_free;
+
+ ret = create_free_space_inode(root, trans, block_group, path);
+ if (ret)
+ goto out_free;
+ goto again;
+ }
+
+ /*
+ * We want to set the generation to 0, that way if anything goes wrong
+ * from here on out we know not to trust this cache when we load up next
+ * time.
+ */
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, inode);
+ WARN_ON(ret);
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_truncate_free_space_cache(root, trans, path,
+ inode);
+ if (ret)
+ goto out_put;
+ }
+
+ spin_lock(&block_group->lock);
+ if (block_group->cached != BTRFS_CACHE_FINISHED) {
+ spin_unlock(&block_group->lock);
+ goto out_put;
+ }
+ spin_unlock(&block_group->lock);
+
+ num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+ if (!num_pages)
+ num_pages = 1;
+
+ /*
+ * Just to make absolutely sure we have enough space, we're going to
+ * preallocate 12 pages worth of space for each block group. In
+ * practice we ought to use at most 8, but we need extra space so we can
+ * add our header and have a terminator between the extents and the
+ * bitmaps.
+ */
+ num_pages *= 16;
+ num_pages *= PAGE_CACHE_SIZE;
+
+ ret = btrfs_check_data_free_space(inode, num_pages);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
+ num_pages, num_pages,
+ &alloc_hint);
+ btrfs_free_reserved_data_space(inode, num_pages);
+out_put:
+ iput(inode);
+out_free:
+ btrfs_release_path(root, path);
+out:
+ spin_lock(&block_group->lock);
+ if (ret)
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ else
+ block_group->disk_cache_state = BTRFS_DC_SETUP;
+ spin_unlock(&block_group->lock);
+
+ return ret;
+}
+
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -2699,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+again:
+ while (1) {
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ while (cache) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ break;
+ cache = next_block_group(root, cache);
+ }
+ if (!cache) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
+ err = cache_save_setup(cache, trans, path);
+ last = cache->key.objectid + cache->key.offset;
+ btrfs_put_block_group(cache);
+ }
+
while (1) {
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
@@ -2708,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_first_block_group(root->fs_info, last);
while (cache) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
+ btrfs_put_block_group(cache);
+ goto again;
+ }
+
if (cache->dirty)
break;
cache = next_block_group(root, cache);
@@ -2719,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
continue;
}
+ if (cache->disk_cache_state == BTRFS_DC_SETUP)
+ cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
cache->dirty = 0;
last = cache->key.objectid + cache->key.offset;
@@ -2727,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
btrfs_put_block_group(cache);
}
+ while (1) {
+ /*
+ * I don't think this is needed since we're just marking our
+ * preallocated extent as written, but just in case it can't
+ * hurt.
+ */
+ if (last == 0) {
+ err = btrfs_run_delayed_refs(trans, root,
+ (unsigned long)-1);
+ BUG_ON(err);
+ }
+
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ while (cache) {
+ /*
+ * Really this shouldn't happen, but it could if we
+ * couldn't write the entire preallocated extent and
+ * splitting the extent resulted in a new block.
+ */
+ if (cache->dirty) {
+ btrfs_put_block_group(cache);
+ goto again;
+ }
+ if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+ break;
+ cache = next_block_group(root, cache);
+ }
+ if (!cache) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
+
+ btrfs_write_out_cache(root, trans, cache, path);
+
+ /*
+ * If we didn't have an error then the cache state is still
+ * NEED_WRITE, so we can set it to WRITTEN.
+ */
+ if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+ cache->disk_cache_state = BTRFS_DC_WRITTEN;
+ last = cache->key.objectid + cache->key.offset;
+ btrfs_put_block_group(cache);
+ }
+
btrfs_free_path(path);
return 0;
}
@@ -2762,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (found) {
spin_lock(&found->lock);
found->total_bytes += total_bytes;
+ found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
found->full = 0;
@@ -2781,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
BTRFS_BLOCK_GROUP_SYSTEM |
BTRFS_BLOCK_GROUP_METADATA);
found->total_bytes = total_bytes;
+ found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
@@ -2882,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 used;
- int ret = 0, committed = 0;
+ int ret = 0, committed = 0, alloc_chunk = 1;
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ if (root == root->fs_info->tree_root) {
+ alloc_chunk = 0;
+ committed = 1;
+ }
+
data_sinfo = BTRFS_I(inode)->space_info;
if (!data_sinfo)
goto alloc;
@@ -2905,7 +3134,7 @@ again:
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
*/
- if (!data_sinfo->full) {
+ if (!data_sinfo->full && alloc_chunk) {
u64 alloc_target;
data_sinfo->force_alloc = 1;
@@ -2997,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
- u64 alloc_bytes)
+static int should_alloc_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, u64 alloc_bytes)
{
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+ u64 thresh;
if (sinfo->bytes_used + sinfo->bytes_reserved +
alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3010,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
alloc_bytes < div_factor(num_bytes, 8))
return 0;
+ thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+
+ if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+ return 0;
+
return 1;
}
@@ -3041,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+ if (!force && !should_alloc_chunk(extent_root, space_info,
+ alloc_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
spin_unlock(&space_info->lock);
/*
+ * If we have mixed data/metadata chunks we want to make sure we keep
+ * allocating mixed chunks instead of individual chunks.
+ */
+ if (btrfs_mixed_space_info(space_info))
+ flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
+ /*
* if we're doing a data chunk, go ahead and make sure that
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
@@ -3072,55 +3316,25 @@ out:
return ret;
}
-static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_space_info *sinfo, u64 num_bytes)
-{
- int ret;
- int end_trans = 0;
-
- if (sinfo->full)
- return 0;
-
- spin_lock(&sinfo->lock);
- ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
- spin_unlock(&sinfo->lock);
- if (!ret)
- return 0;
-
- if (!trans) {
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- end_trans = 1;
- }
-
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes + 2 * 1024 * 1024,
- get_alloc_profile(root, sinfo->flags), 0);
-
- if (end_trans)
- btrfs_end_transaction(trans, root);
-
- return ret == 1 ? 1 : 0;
-}
-
/*
* shrink metadata reservation for delalloc
*/
static int shrink_delalloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 to_reclaim)
+ struct btrfs_root *root, u64 to_reclaim, int sync)
{
struct btrfs_block_rsv *block_rsv;
+ struct btrfs_space_info *space_info;
u64 reserved;
u64 max_reclaim;
u64 reclaimed = 0;
int pause = 1;
- int ret;
+ int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
block_rsv = &root->fs_info->delalloc_block_rsv;
- spin_lock(&block_rsv->lock);
- reserved = block_rsv->reserved;
- spin_unlock(&block_rsv->lock);
+ space_info = block_rsv->space_info;
+
+ smp_mb();
+ reserved = space_info->bytes_reserved;
if (reserved == 0)
return 0;
@@ -3128,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
max_reclaim = min(reserved, to_reclaim);
while (1) {
- ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
- if (!ret) {
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(pause);
- pause <<= 1;
- if (pause > HZ / 10)
- pause = HZ / 10;
- } else {
- pause = 1;
- }
+ /* have the flusher threads jump in and do some IO */
+ smp_mb();
+ nr_pages = min_t(unsigned long, nr_pages,
+ root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+ writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
- spin_lock(&block_rsv->lock);
- if (reserved > block_rsv->reserved)
- reclaimed = reserved - block_rsv->reserved;
- reserved = block_rsv->reserved;
- spin_unlock(&block_rsv->lock);
+ spin_lock(&space_info->lock);
+ if (reserved > space_info->bytes_reserved)
+ reclaimed += reserved - space_info->bytes_reserved;
+ reserved = space_info->bytes_reserved;
+ spin_unlock(&space_info->lock);
if (reserved == 0 || reclaimed >= max_reclaim)
break;
if (trans && trans->transaction->blocked)
return -EAGAIN;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(pause);
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
+
}
return reclaimed >= to_reclaim;
}
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int *retries)
+/*
+ * Retries tells us how many times we've called reserve_metadata_bytes. The
+ * idea is if this is the first call (retries == 0) then we will add to our
+ * reserved count if we can't make the allocation in order to hold our place
+ * while we go and try and free up space. That way for retries > 1 we don't try
+ * and add space, we just check to see if the amount of unused space is >= the
+ * total space, meaning that our reservation is valid.
+ *
+ * However if we don't intend to retry this reservation, pass -1 as retries so
+ * that it short circuits this logic.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes, int flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
- int ret;
+ u64 unused;
+ u64 num_bytes = orig_bytes;
+ int retries = 0;
+ int ret = 0;
+ bool reserved = false;
+ bool committed = false;
- if ((*retries) > 2)
- return -ENOSPC;
+again:
+ ret = -ENOSPC;
+ if (reserved)
+ num_bytes = 0;
- ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
- if (ret)
- return 1;
+ spin_lock(&space_info->lock);
+ unused = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
- if (trans && trans->transaction->in_commit)
- return -ENOSPC;
+ /*
+ * The idea here is that we've not already over-reserved the block group
+ * then we can go ahead and save our reservation first and then start
+ * flushing if we need to. Otherwise if we've already overcommitted
+ * lets start flushing stuff first and then come back and try to make
+ * our reservation.
+ */
+ if (unused <= space_info->total_bytes) {
+ unused -= space_info->total_bytes;
+ if (unused >= num_bytes) {
+ if (!reserved)
+ space_info->bytes_reserved += orig_bytes;
+ ret = 0;
+ } else {
+ /*
+ * Ok set num_bytes to orig_bytes since we aren't
+ * overocmmitted, this way we only try and reclaim what
+ * we need.
+ */
+ num_bytes = orig_bytes;
+ }
+ } else {
+ /*
+ * Ok we're over committed, set num_bytes to the overcommitted
+ * amount plus the amount of bytes that we need for this
+ * reservation.
+ */
+ num_bytes = unused - space_info->total_bytes +
+ (orig_bytes * (retries + 1));
+ }
- ret = shrink_delalloc(trans, root, num_bytes);
- if (ret)
- return ret;
+ /*
+ * Couldn't make our reservation, save our place so while we're trying
+ * to reclaim space we can actually use it instead of somebody else
+ * stealing it from us.
+ */
+ if (ret && !reserved) {
+ space_info->bytes_reserved += orig_bytes;
+ reserved = true;
+ }
- spin_lock(&space_info->lock);
- if (space_info->bytes_pinned < num_bytes)
- ret = 1;
spin_unlock(&space_info->lock);
- if (ret)
- return -ENOSPC;
-
- (*retries)++;
- if (trans)
- return -EAGAIN;
+ if (!ret)
+ return 0;
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ if (!flush)
+ goto out;
- return 1;
-}
+ /*
+ * We do synchronous shrinking since we don't actually unreserve
+ * metadata until after the IO is completed.
+ */
+ ret = shrink_delalloc(trans, root, num_bytes, 1);
+ if (ret > 0)
+ return 0;
+ else if (ret < 0)
+ goto out;
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- struct btrfs_space_info *space_info = block_rsv->space_info;
- u64 unused;
- int ret = -ENOSPC;
+ /*
+ * So if we were overcommitted it's possible that somebody else flushed
+ * out enough space and we simply didn't have enough space to reclaim,
+ * so go back around and try again.
+ */
+ if (retries < 2) {
+ retries++;
+ goto again;
+ }
spin_lock(&space_info->lock);
- unused = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly;
+ /*
+ * Not enough space to be reclaimed, don't bother committing the
+ * transaction.
+ */
+ if (space_info->bytes_pinned < orig_bytes)
+ ret = -ENOSPC;
+ spin_unlock(&space_info->lock);
+ if (ret)
+ goto out;
- if (unused < space_info->total_bytes)
- unused = space_info->total_bytes - unused;
- else
- unused = 0;
+ ret = -EAGAIN;
+ if (trans || committed)
+ goto out;
- if (unused >= num_bytes) {
- if (block_rsv->priority >= 10) {
- space_info->bytes_reserved += num_bytes;
- ret = 0;
- } else {
- if ((unused + block_rsv->reserved) *
- block_rsv->priority >=
- (num_bytes + block_rsv->reserved) * 10) {
- space_info->bytes_reserved += num_bytes;
- ret = 0;
- }
- }
+ ret = -ENOSPC;
+ trans = btrfs_join_transaction(root, 1);
+ if (IS_ERR(trans))
+ goto out;
+ ret = btrfs_commit_transaction(trans, root);
+ if (!ret) {
+ trans = NULL;
+ committed = true;
+ goto again;
+ }
+
+out:
+ if (reserved) {
+ spin_lock(&space_info->lock);
+ space_info->bytes_reserved -= orig_bytes;
+ spin_unlock(&space_info->lock);
}
- spin_unlock(&space_info->lock);
return ret;
}
@@ -3327,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 alloc_target;
block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
if (!block_rsv)
return NULL;
btrfs_init_block_rsv(block_rsv);
-
- alloc_target = btrfs_get_alloc_profile(root, 0);
block_rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
-
return block_rsv;
}
@@ -3369,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int *retries)
+ u64 num_bytes)
{
int ret;
if (num_bytes == 0)
return 0;
-again:
- ret = reserve_metadata_bytes(block_rsv, num_bytes);
+
+ ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 1);
return 0;
}
- ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
- if (ret > 0)
- goto again;
-
return ret;
}
@@ -3420,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
return 0;
if (block_rsv->refill_used) {
- ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ ret = reserve_metadata_bytes(trans, root, block_rsv,
+ num_bytes, 0);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
return 0;
@@ -3499,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&sinfo->lock);
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+ data_used = 0;
meta_used = sinfo->bytes_used;
spin_unlock(&sinfo->lock);
@@ -3526,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
block_rsv->size = num_bytes;
num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
- sinfo->bytes_reserved + sinfo->bytes_readonly;
+ sinfo->bytes_reserved + sinfo->bytes_readonly +
+ sinfo->bytes_may_use;
if (sinfo->total_bytes > num_bytes) {
num_bytes = sinfo->total_bytes - num_bytes;
@@ -3597,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- int num_items, int *retries)
+ int num_items)
{
u64 num_bytes;
int ret;
@@ -3607,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
num_bytes = calc_trans_metadata_size(root, num_items);
ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
- num_bytes, retries);
+ num_bytes);
if (!ret) {
trans->bytes_reserved += num_bytes;
trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3681,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
u64 to_reserve;
int nr_extents;
- int retries = 0;
int ret;
if (btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
+
spin_lock(&BTRFS_I(inode)->accounting_lock);
nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3698,18 +3972,14 @@ again:
nr_extents = 0;
to_reserve = 0;
}
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
to_reserve += calc_csum_metadata_size(inode, num_bytes);
- ret = reserve_metadata_bytes(block_rsv, to_reserve);
- if (ret) {
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
- &retries);
- if (ret > 0)
- goto again;
+ ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+ if (ret)
return ret;
- }
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
BTRFS_I(inode)->reserved_extents += nr_extents;
atomic_inc(&BTRFS_I(inode)->outstanding_extents);
spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -3717,7 +3987,7 @@ again:
block_rsv_add_bytes(block_rsv, to_reserve, 1);
if (block_rsv->size > 512 * 1024 * 1024)
- shrink_delalloc(NULL, root, to_reserve);
+ shrink_delalloc(NULL, root, to_reserve, 0);
return 0;
}
@@ -3776,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group_cache *cache = NULL;
struct btrfs_fs_info *info = root->fs_info;
- int factor;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
+ int factor;
/* block accounting for super block */
spin_lock(&info->delalloc_lock);
@@ -3803,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
factor = 2;
else
factor = 1;
+ /*
+ * If this block group has free space cache written out, we
+ * need to make sure to load it if we are removing space. This
+ * is because we need the unpinning stage to actually add the
+ * space back to the block group, otherwise we will leak space.
+ */
+ if (!alloc && cache->cached == BTRFS_CACHE_NO)
+ cache_block_group(cache, trans, 1);
+
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
+
+ if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+
cache->dirty = 1;
old_val = btrfs_block_group_used(&cache->item);
num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -4554,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
+ bool use_cluster = true;
u64 ideal_cache_percent = 0;
u64 ideal_cache_offset = 0;
@@ -4568,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
return -ENOSPC;
}
+ /*
+ * If the space info is for both data and metadata it means we have a
+ * small filesystem and we can't use the clustering stuff.
+ */
+ if (btrfs_mixed_space_info(space_info))
+ use_cluster = false;
+
if (orig_root->ref_cows || empty_size)
allowed_chunk_alloc = 1;
- if (data & BTRFS_BLOCK_GROUP_METADATA) {
+ if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
empty_cluster = 64 * 1024;
}
- if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+ if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+ btrfs_test_opt(root, SSD)) {
last_ptr = &root->fs_info->data_alloc_cluster;
}
@@ -4641,6 +4934,10 @@ have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
u64 free_percent;
+ ret = cache_block_group(block_group, trans, 1);
+ if (block_group->cached == BTRFS_CACHE_FINISHED)
+ goto have_block_group;
+
free_percent = btrfs_block_group_used(&block_group->item);
free_percent *= 100;
free_percent = div64_u64(free_percent,
@@ -4661,7 +4958,7 @@ have_block_group:
if (loop > LOOP_CACHING_NOWAIT ||
(loop > LOOP_FIND_IDEAL &&
atomic_read(&space_info->caching_threads) < 2)) {
- ret = cache_block_group(block_group);
+ ret = cache_block_group(block_group, trans, 0);
BUG_ON(ret);
}
found_uncached_bg = true;
@@ -5218,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group);
+ cache_block_group(block_group, trans, 0);
caching_ctl = get_caching_control(block_group);
if (!caching_ctl) {
@@ -5308,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(block_rsv, blocksize);
+ ret = reserve_metadata_bytes(trans, root, block_rsv,
+ blocksize, 0);
if (ret)
return ERR_PTR(ret);
return block_rsv;
@@ -5318,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
if (!ret)
return block_rsv;
- WARN_ON(1);
- printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
- block_rsv->size, block_rsv->reserved,
- block_rsv->freed[0], block_rsv->freed[1]);
-
return ERR_PTR(-ENOSPC);
}
@@ -5421,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
u64 generation;
u64 refs;
u64 flags;
- u64 last = 0;
u32 nritems;
u32 blocksize;
struct btrfs_key key;
@@ -5489,7 +5781,6 @@ reada:
generation);
if (ret)
break;
- last = bytenr + blocksize;
nread++;
}
wc->reada_slot = slot;
@@ -7813,6 +8104,40 @@ out:
return ret;
}
+void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
+{
+ struct btrfs_block_group_cache *block_group;
+ u64 last = 0;
+
+ while (1) {
+ struct inode *inode;
+
+ block_group = btrfs_lookup_first_block_group(info, last);
+ while (block_group) {
+ spin_lock(&block_group->lock);
+ if (block_group->iref)
+ break;
+ spin_unlock(&block_group->lock);
+ block_group = next_block_group(info->tree_root,
+ block_group);
+ }
+ if (!block_group) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
+
+ inode = block_group->inode;
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ last = block_group->key.objectid + block_group->key.offset;
+ btrfs_put_block_group(block_group);
+ }
+}
+
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_block_group_cache *block_group;
@@ -7896,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf;
+ int need_clear = 0;
+ u64 cache_gen;
root = info->extent_root;
key.objectid = 0;
@@ -7905,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (!path)
return -ENOMEM;
+ cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+ if (cache_gen != 0 &&
+ btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+ need_clear = 1;
+ if (btrfs_test_opt(root, CLEAR_CACHE))
+ need_clear = 1;
+ if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
+ printk(KERN_INFO "btrfs: disk space caching is enabled\n");
+
while (1) {
ret = find_first_block_group(root, path, &key);
if (ret > 0)
@@ -7927,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+ if (need_clear)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+
/*
* we only want to have 32k of ram per block group for keeping
* track of free space, and if we pass 1/2 of that we want to
@@ -8031,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.offset = size;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = root->sectorsize;
+ cache->fs_info = root->fs_info;
/*
* we only want to have 32k of ram per block group for keeping track
@@ -8087,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_block_group_cache *block_group;
struct btrfs_free_cluster *cluster;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
struct btrfs_key key;
+ struct inode *inode;
int ret;
+ int factor;
root = root->fs_info->extent_root;
@@ -8097,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
BUG_ON(!block_group->ro);
memcpy(&key, &block_group->key, sizeof(key));
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
/* make sure this block group isn't part of an allocation cluster */
cluster = &root->fs_info->data_alloc_cluster;
@@ -8116,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (!IS_ERR(inode)) {
+ btrfs_orphan_add(trans, inode);
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (block_group->iref) {
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for our lookup ref */
+ iput(inode);
+ }
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ btrfs_release_path(tree_root, path);
+ if (ret == 0) {
+ ret = btrfs_del_item(trans, tree_root, path);
+ if (ret)
+ goto out;
+ btrfs_release_path(tree_root, path);
+ }
+
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
@@ -8137,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&block_group->space_info->lock);
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
+ block_group->space_info->disk_total -= block_group->key.offset * factor;
spin_unlock(&block_group->space_info->lock);
+ memcpy(&key, &block_group->key, sizeof(key));
+
btrfs_clear_space_info_full(root->fs_info);
btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53..eac10e3260a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
struct address_space *mapping, gfp_t mask)
{
tree->state = RB_ROOT;
- tree->buffer = RB_ROOT;
+ INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
@@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
return ret;
}
-static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
- u64 offset, struct rb_node *node)
-{
- struct rb_root *root = &tree->buffer;
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct extent_buffer *eb;
-
- while (*p) {
- parent = *p;
- eb = rb_entry(parent, struct extent_buffer, rb_node);
-
- if (offset < eb->start)
- p = &(*p)->rb_left;
- else if (offset > eb->start)
- p = &(*p)->rb_right;
- else
- return eb;
- }
-
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
- u64 offset)
-{
- struct rb_root *root = &tree->buffer;
- struct rb_node *n = root->rb_node;
- struct extent_buffer *eb;
-
- while (n) {
- eb = rb_entry(n, struct extent_buffer, rb_node);
- if (offset < eb->start)
- n = n->rb_left;
- else if (offset > eb->start)
- n = n->rb_right;
- else
- return eb;
- }
- return NULL;
-}
-
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
struct extent_state *other)
{
@@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
- u64 end;
start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
bio->bi_private = NULL;
@@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 last_byte = i_size_read(inode);
u64 block_start;
u64 iosize;
- u64 unlock_start;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
@@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
- unlock_start = page_end + 1;
goto done;
}
@@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
- unlock_start = page_end + 1;
break;
}
em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
cur += iosize;
pg_offset += iosize;
- unlock_start = cur;
continue;
}
/* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int range_whole = 0;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
scanned = 1;
}
retry:
@@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
NULL, 1,
end_bio_extent_preparewrite, 0,
0, 0);
+ if (ret && !err)
+ err = ret;
iocount++;
block_start = block_start + iosize;
} else {
@@ -3104,6 +3053,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+ unsigned long start_idx)
+{
+ unsigned long index;
+ struct page *page;
+
+ if (!eb->first_page)
+ return;
+
+ index = num_extent_pages(eb->start, eb->len);
+ if (start_idx >= index)
+ return;
+
+ do {
+ index--;
+ page = extent_buffer_page(eb, index);
+ if (page)
+ page_cache_release(page);
+ } while (index != start_idx);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+ btrfs_release_extent_buffer_page(eb, 0);
+ __free_extent_buffer(eb);
+}
+
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len,
struct page *page0,
@@ -3117,16 +3099,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
struct page *p;
struct address_space *mapping = tree->mapping;
int uptodate = 1;
+ int ret;
- spin_lock(&tree->buffer_lock);
- eb = buffer_search(tree, start);
- if (eb) {
- atomic_inc(&eb->refs);
- spin_unlock(&tree->buffer_lock);
+ rcu_read_lock();
+ eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
mark_page_accessed(eb->first_page);
return eb;
}
- spin_unlock(&tree->buffer_lock);
+ rcu_read_unlock();
eb = __alloc_extent_buffer(tree, start, len, mask);
if (!eb)
@@ -3165,26 +3147,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto free_eb;
+
spin_lock(&tree->buffer_lock);
- exists = buffer_tree_insert(tree, start, &eb->rb_node);
- if (exists) {
+ ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+ if (ret == -EEXIST) {
+ exists = radix_tree_lookup(&tree->buffer,
+ start >> PAGE_CACHE_SHIFT);
/* add one reference for the caller */
atomic_inc(&exists->refs);
spin_unlock(&tree->buffer_lock);
+ radix_tree_preload_end();
goto free_eb;
}
/* add one reference for the tree */
atomic_inc(&eb->refs);
spin_unlock(&tree->buffer_lock);
+ radix_tree_preload_end();
return eb;
free_eb:
if (!atomic_dec_and_test(&eb->refs))
return exists;
- for (index = 1; index < i; index++)
- page_cache_release(extent_buffer_page(eb, index));
- page_cache_release(extent_buffer_page(eb, 0));
- __free_extent_buffer(eb);
+ btrfs_release_extent_buffer(eb);
return exists;
}
@@ -3194,16 +3181,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
{
struct extent_buffer *eb;
- spin_lock(&tree->buffer_lock);
- eb = buffer_search(tree, start);
- if (eb)
- atomic_inc(&eb->refs);
- spin_unlock(&tree->buffer_lock);
-
- if (eb)
+ rcu_read_lock();
+ eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
mark_page_accessed(eb->first_page);
+ return eb;
+ }
+ rcu_read_unlock();
- return eb;
+ return NULL;
}
void free_extent_buffer(struct extent_buffer *eb)
@@ -3833,34 +3820,45 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+ struct extent_buffer *eb =
+ container_of(head, struct extent_buffer, rcu_head);
+
+ btrfs_release_extent_buffer(eb);
+}
+
int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
{
u64 start = page_offset(page);
struct extent_buffer *eb;
int ret = 1;
- unsigned long i;
- unsigned long num_pages;
spin_lock(&tree->buffer_lock);
- eb = buffer_search(tree, start);
+ eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (!eb)
goto out;
- if (atomic_read(&eb->refs) > 1) {
+ if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
ret = 0;
goto out;
}
- if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+
+ /*
+ * set @eb->refs to 0 if it is already 1, and then release the @eb.
+ * Or go back.
+ */
+ if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
ret = 0;
goto out;
}
- /* at this point we can safely release the extent buffer */
- num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++)
- page_cache_release(extent_buffer_page(eb, i));
- rb_erase(&eb->rb_node, &tree->buffer);
- __free_extent_buffer(eb);
+
+ radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
out:
spin_unlock(&tree->buffer_lock);
+
+ /* at this point we can safely release the extent buffer */
+ if (atomic_read(&eb->refs) == 0)
+ call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return ret;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590d..1c6d4f342ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -85,7 +85,7 @@ struct extent_io_ops {
struct extent_io_tree {
struct rb_root state;
- struct rb_root buffer;
+ struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
spinlock_t lock;
@@ -123,7 +123,7 @@ struct extent_buffer {
unsigned long bflags;
atomic_t refs;
struct list_head leak_list;
- struct rb_node rb_node;
+ struct rcu_head rcu_head;
/* the spinlock is used to protect most operations */
spinlock_t lock;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d645..23cb8da3ff6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
goto out;
}
if (IS_ERR(rb_node)) {
- em = ERR_PTR(PTR_ERR(rb_node));
+ em = ERR_CAST(rb_node);
goto out;
}
em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
goto out;
}
if (IS_ERR(rb_node)) {
- em = ERR_PTR(PTR_ERR(rb_node));
+ em = ERR_CAST(rb_node);
goto out;
}
em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d9..22ee0dc2e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,10 +23,761 @@
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
+#include "disk-io.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+static void recalculate_thresholds(struct btrfs_block_group_cache
+ *block_group);
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info);
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_block_group_cache
+ *block_group, struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_key location;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct inode *inode = NULL;
+ int ret;
+
+ spin_lock(&block_group->lock);
+ if (block_group->inode)
+ inode = igrab(block_group->inode);
+ spin_unlock(&block_group->lock);
+ if (inode)
+ return inode;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0) {
+ btrfs_release_path(root, path);
+ return ERR_PTR(-ENOENT);
+ }
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_free_space_key(leaf, header, &disk_key);
+ btrfs_disk_key_to_cpu(&location, &disk_key);
+ btrfs_release_path(root, path);
+
+ inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+ if (IS_ERR(inode))
+ return inode;
+ if (is_bad_inode(inode)) {
+ iput(inode);
+ return ERR_PTR(-ENOENT);
+ }
+
+ spin_lock(&block_group->lock);
+ if (!root->fs_info->closing) {
+ block_group->inode = igrab(inode);
+ block_group->iref = 1;
+ }
+ spin_unlock(&block_group->lock);
+
+ return inode;
+}
+
+int create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ struct btrfs_disk_key disk_key;
+ struct btrfs_free_space_header *header;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ u64 objectid;
+ int ret;
+
+ ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ if (ret)
+ return ret;
+
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ btrfs_item_key(leaf, &disk_key, path->slots[0]);
+ memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+ sizeof(*inode_item));
+ btrfs_set_inode_generation(leaf, inode_item, trans->transid);
+ btrfs_set_inode_size(leaf, inode_item, 0);
+ btrfs_set_inode_nbytes(leaf, inode_item, 0);
+ btrfs_set_inode_uid(leaf, inode_item, 0);
+ btrfs_set_inode_gid(leaf, inode_item, 0);
+ btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
+ btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+ btrfs_set_inode_nlink(leaf, inode_item, 1);
+ btrfs_set_inode_transid(leaf, inode_item, trans->transid);
+ btrfs_set_inode_block_group(leaf, inode_item,
+ block_group->key.objectid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root, path);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_free_space_header));
+ if (ret < 0) {
+ btrfs_release_path(root, path);
+ return ret;
+ }
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+ btrfs_set_free_space_key(leaf, header, &disk_key);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root, path);
+
+ return 0;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode)
+{
+ loff_t oldsize;
+ int ret = 0;
+
+ trans->block_rsv = root->orphan_block_rsv;
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv,
+ 0, 5);
+ if (ret)
+ return ret;
+
+ oldsize = i_size_read(inode);
+ btrfs_i_size_write(inode, 0);
+ truncate_pagecache(inode, oldsize, 0);
+
+ /*
+ * We don't need an orphan item because truncating the free space cache
+ * will never be split across transactions.
+ */
+ ret = btrfs_truncate_inode_items(trans, root, inode,
+ 0, BTRFS_EXTENT_DATA_KEY);
+ if (ret) {
+ WARN_ON(1);
+ return ret;
+ }
+
+ return btrfs_update_inode(trans, root, inode);
+}
+
+static int readahead_cache(struct inode *inode)
+{
+ struct file_ra_state *ra;
+ unsigned long last_index;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+
+ file_ra_state_init(ra, inode->i_mapping);
+ last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
+
+ kfree(ra);
+
+ return 0;
+}
+
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct inode *inode;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct page *page;
+ struct btrfs_path *path;
+ u32 *checksums = NULL, *crc;
+ char *disk_crcs = NULL;
+ struct btrfs_key key;
+ struct list_head bitmaps;
+ u64 num_entries;
+ u64 num_bitmaps;
+ u64 generation;
+ u32 cur_crc = ~(u32)0;
+ pgoff_t index = 0;
+ unsigned long first_page_offset;
+ int num_checksums;
+ int ret = 0;
+
+ /*
+ * If we're unmounting then just return, since this does a search on the
+ * normal root and not the commit root and we could deadlock.
+ */
+ smp_mb();
+ if (fs_info->closing)
+ return 0;
+
+ /*
+ * If this block group has been marked to be cleared for one reason or
+ * another then we can't trust the on disk cache, so just return.
+ */
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ INIT_LIST_HEAD(&bitmaps);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode)) {
+ btrfs_free_path(path);
+ return 0;
+ }
+
+ /* Nothing in the space cache, goodbye */
+ if (!i_size_read(inode)) {
+ btrfs_free_path(path);
+ goto out;
+ }
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret) {
+ btrfs_free_path(path);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ num_entries = btrfs_free_space_entries(leaf, header);
+ num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
+ generation = btrfs_free_space_generation(leaf, header);
+ btrfs_free_path(path);
+
+ if (BTRFS_I(inode)->generation != generation) {
+ printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
+ " not match free space cache generation (%llu) for "
+ "block group %llu\n",
+ (unsigned long long)BTRFS_I(inode)->generation,
+ (unsigned long long)generation,
+ (unsigned long long)block_group->key.objectid);
+ goto out;
+ }
+
+ if (!num_entries)
+ goto out;
+
+ /* Setup everything for doing checksumming */
+ num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+ checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+ if (!checksums)
+ goto out;
+ first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+ disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
+ if (!disk_crcs)
+ goto out;
+
+ ret = readahead_cache(inode);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ while (1) {
+ struct btrfs_free_space_entry *entry;
+ struct btrfs_free_space *e;
+ void *addr;
+ unsigned long offset = 0;
+ unsigned long start_offset = 0;
+ int need_loop = 0;
+
+ if (!num_entries && !num_bitmaps)
+ break;
+
+ if (index == 0) {
+ start_offset = first_page_offset;
+ offset = start_offset;
+ }
+
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page) {
+ ret = 0;
+ goto free_cache;
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ printk(KERN_ERR "btrfs: error reading free "
+ "space cache: %llu\n",
+ (unsigned long long)
+ block_group->key.objectid);
+ goto free_cache;
+ }
+ }
+ addr = kmap(page);
+
+ if (index == 0) {
+ u64 *gen;
+
+ memcpy(disk_crcs, addr, first_page_offset);
+ gen = addr + (sizeof(u32) * num_checksums);
+ if (*gen != BTRFS_I(inode)->generation) {
+ printk(KERN_ERR "btrfs: space cache generation"
+ " (%llu) does not match inode (%llu) "
+ "for block group %llu\n",
+ (unsigned long long)*gen,
+ (unsigned long long)
+ BTRFS_I(inode)->generation,
+ (unsigned long long)
+ block_group->key.objectid);
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
+ crc = (u32 *)disk_crcs;
+ }
+ entry = addr + start_offset;
+
+ /* First lets check our crc before we do anything fun */
+ cur_crc = ~(u32)0;
+ cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
+ PAGE_CACHE_SIZE - start_offset);
+ btrfs_csum_final(cur_crc, (char *)&cur_crc);
+ if (cur_crc != *crc) {
+ printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
+ "block group %llu\n", index,
+ (unsigned long long)block_group->key.objectid);
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
+ crc++;
+
+ while (1) {
+ if (!num_entries)
+ break;
+
+ need_loop = 1;
+ e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ if (!e) {
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
+
+ e->offset = le64_to_cpu(entry->offset);
+ e->bytes = le64_to_cpu(entry->bytes);
+ if (!e->bytes) {
+ kunmap(page);
+ kfree(e);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
+
+ if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
+ spin_lock(&block_group->tree_lock);
+ ret = link_free_space(block_group, e);
+ spin_unlock(&block_group->tree_lock);
+ BUG_ON(ret);
+ } else {
+ e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ if (!e->bitmap) {
+ kunmap(page);
+ kfree(e);
+ unlock_page(page);
+ page_cache_release(page);
+ goto free_cache;
+ }
+ spin_lock(&block_group->tree_lock);
+ ret = link_free_space(block_group, e);
+ block_group->total_bitmaps++;
+ recalculate_thresholds(block_group);
+ spin_unlock(&block_group->tree_lock);
+ list_add_tail(&e->list, &bitmaps);
+ }
+
+ num_entries--;
+ offset += sizeof(struct btrfs_free_space_entry);
+ if (offset + sizeof(struct btrfs_free_space_entry) >=
+ PAGE_CACHE_SIZE)
+ break;
+ entry++;
+ }
+
+ /*
+ * We read an entry out of this page, we need to move on to the
+ * next page.
+ */
+ if (need_loop) {
+ kunmap(page);
+ goto next;
+ }
+
+ /*
+ * We add the bitmaps at the end of the entries in order that
+ * the bitmap entries are added to the cache.
+ */
+ e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+ list_del_init(&e->list);
+ memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
+ kunmap(page);
+ num_bitmaps--;
+next:
+ unlock_page(page);
+ page_cache_release(page);
+ index++;
+ }
+
+ ret = 1;
+out:
+ kfree(checksums);
+ kfree(disk_crcs);
+ iput(inode);
+ return ret;
+
+free_cache:
+ /* This cache is bogus, make sure it gets cleared */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ btrfs_remove_free_space_cache(block_group);
+ goto out;
+}
+
+int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ struct inode *inode;
+ struct rb_node *node;
+ struct list_head *pos, *n;
+ struct page *page;
+ struct extent_state *cached_state = NULL;
+ struct list_head bitmap_list;
+ struct btrfs_key key;
+ u64 bytes = 0;
+ u32 *crc, *checksums;
+ pgoff_t index = 0, last_index = 0;
+ unsigned long first_page_offset;
+ int num_checksums;
+ int entries = 0;
+ int bitmaps = 0;
+ int ret = 0;
+
+ root = root->fs_info->tree_root;
+
+ INIT_LIST_HEAD(&bitmap_list);
+
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode))
+ return 0;
+
+ if (!i_size_read(inode)) {
+ iput(inode);
+ return 0;
+ }
+
+ last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ filemap_write_and_wait(inode->i_mapping);
+ btrfs_wait_ordered_range(inode, inode->i_size &
+ ~(root->sectorsize - 1), (u64)-1);
+
+ /* We need a checksum per page. */
+ num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
+ crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
+ if (!crc) {
+ iput(inode);
+ return 0;
+ }
+
+ /* Since the first page has all of our checksums and our generation we
+ * need to calculate the offset into the page that we can start writing
+ * our entries.
+ */
+ first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+
+ node = rb_first(&block_group->free_space_offset);
+ if (!node)
+ goto out_free;
+
+ /*
+ * Lock all pages first so we can lock the extent safely.
+ *
+ * NOTE: Because we hold the ref the entire time we're going to write to
+ * the page find_get_page should never fail, so we don't do a check
+ * after find_get_page at this point. Just putting this here so people
+ * know and don't freak out.
+ */
+ while (index <= last_index) {
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page) {
+ pgoff_t i = 0;
+
+ while (i < index) {
+ page = find_get_page(inode->i_mapping, i);
+ unlock_page(page);
+ page_cache_release(page);
+ page_cache_release(page);
+ i++;
+ }
+ goto out_free;
+ }
+ index++;
+ }
+
+ index = 0;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ 0, &cached_state, GFP_NOFS);
+
+ /* Write out the extent entries */
+ do {
+ struct btrfs_free_space_entry *entry;
+ void *addr;
+ unsigned long offset = 0;
+ unsigned long start_offset = 0;
+
+ if (index == 0) {
+ start_offset = first_page_offset;
+ offset = start_offset;
+ }
+
+ page = find_get_page(inode->i_mapping, index);
+
+ addr = kmap(page);
+ entry = addr + start_offset;
+
+ memset(addr, 0, PAGE_CACHE_SIZE);
+ while (1) {
+ struct btrfs_free_space *e;
+
+ e = rb_entry(node, struct btrfs_free_space, offset_index);
+ entries++;
+
+ entry->offset = cpu_to_le64(e->offset);
+ entry->bytes = cpu_to_le64(e->bytes);
+ if (e->bitmap) {
+ entry->type = BTRFS_FREE_SPACE_BITMAP;
+ list_add_tail(&e->list, &bitmap_list);
+ bitmaps++;
+ } else {
+ entry->type = BTRFS_FREE_SPACE_EXTENT;
+ }
+ node = rb_next(node);
+ if (!node)
+ break;
+ offset += sizeof(struct btrfs_free_space_entry);
+ if (offset + sizeof(struct btrfs_free_space_entry) >=
+ PAGE_CACHE_SIZE)
+ break;
+ entry++;
+ }
+ *crc = ~(u32)0;
+ *crc = btrfs_csum_data(root, addr + start_offset, *crc,
+ PAGE_CACHE_SIZE - start_offset);
+ kunmap(page);
+
+ btrfs_csum_final(*crc, (char *)crc);
+ crc++;
+
+ bytes += PAGE_CACHE_SIZE;
+
+ ClearPageChecked(page);
+ set_page_extent_mapped(page);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ /*
+ * We need to release our reference we got for grab_cache_page,
+ * except for the first page which will hold our checksums, we
+ * do that below.
+ */
+ if (index != 0) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+ page_cache_release(page);
+
+ index++;
+ } while (node);
+
+ /* Write out the bitmaps */
+ list_for_each_safe(pos, n, &bitmap_list) {
+ void *addr;
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+
+ page = find_get_page(inode->i_mapping, index);
+
+ addr = kmap(page);
+ memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
+ *crc = ~(u32)0;
+ *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
+ kunmap(page);
+ btrfs_csum_final(*crc, (char *)crc);
+ crc++;
+ bytes += PAGE_CACHE_SIZE;
+
+ ClearPageChecked(page);
+ set_page_extent_mapped(page);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ page_cache_release(page);
+ list_del_init(&entry->list);
+ index++;
+ }
+
+ /* Zero out the rest of the pages just to make sure */
+ while (index <= last_index) {
+ void *addr;
+
+ page = find_get_page(inode->i_mapping, index);
+
+ addr = kmap(page);
+ memset(addr, 0, PAGE_CACHE_SIZE);
+ kunmap(page);
+ ClearPageChecked(page);
+ set_page_extent_mapped(page);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ page_cache_release(page);
+ bytes += PAGE_CACHE_SIZE;
+ index++;
+ }
+
+ btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
+
+ /* Write the checksums and trans id to the first page */
+ {
+ void *addr;
+ u64 *gen;
+
+ page = find_get_page(inode->i_mapping, 0);
+
+ addr = kmap(page);
+ memcpy(addr, checksums, sizeof(u32) * num_checksums);
+ gen = addr + (sizeof(u32) * num_checksums);
+ *gen = trans->transid;
+ kunmap(page);
+ ClearPageChecked(page);
+ set_page_extent_mapped(page);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ page_cache_release(page);
+ }
+ BTRFS_I(inode)->generation = trans->transid;
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ filemap_write_and_wait(inode->i_mapping);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = block_group->key.objectid;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ if (ret < 0) {
+ ret = 0;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+ goto out_free;
+ }
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ struct btrfs_key found_key;
+ BUG_ON(!path->slots[0]);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+ found_key.offset != block_group->key.objectid) {
+ ret = 0;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL,
+ GFP_NOFS);
+ btrfs_release_path(root, path);
+ goto out_free;
+ }
+ }
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_set_free_space_entries(leaf, header, entries);
+ btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+ btrfs_set_free_space_generation(leaf, header, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(root, path);
+
+ ret = 1;
+
+out_free:
+ if (ret == 0) {
+ invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+ BTRFS_I(inode)->generation = 0;
+ }
+ kfree(checksums);
+ btrfs_update_inode(trans, root, inode);
+ iput(inode);
+ return ret;
+}
+
static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
u64 offset)
{
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011..e49ca5c321b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,24 @@ struct btrfs_free_space {
struct list_head list;
};
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_block_group_cache
+ *block_group, struct btrfs_path *path);
+int create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct inode *inode);
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64f99cf69ce..558cac2dfa5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 num_bytes;
- u64 orig_start;
- u64 disk_num_bytes;
u64 blocksize = root->sectorsize;
u64 actual_end;
u64 isize = i_size_read(inode);
@@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode,
int i;
int will_compress;
- orig_start = start;
-
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -371,7 +367,6 @@ again:
total_compressed = min(total_compressed, max_uncompressed);
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
num_bytes = max(blocksize, num_bytes);
- disk_num_bytes = num_bytes;
total_in = 0;
ret = 0;
@@ -467,7 +462,6 @@ again:
if (total_compressed >= total_in) {
will_compress = 0;
} else {
- disk_num_bytes = total_compressed;
num_bytes = total_in;
}
}
@@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode,
u64 disk_num_bytes;
u64 cur_alloc_size;
u64 blocksize = root->sectorsize;
- u64 actual_end;
- u64 isize = i_size_read(inode);
struct btrfs_key ins;
struct extent_map *em;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
+ BUG_ON(root == root->fs_info->tree_root);
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- actual_end = min_t(u64, isize, end + 1);
-
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
num_bytes = max(blocksize, num_bytes);
disk_num_bytes = num_bytes;
@@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
int type;
int nocow;
int check_prev = 1;
+ bool nolock = false;
path = btrfs_alloc_path();
BUG_ON(!path);
- trans = btrfs_join_transaction(root, 1);
+ if (root == root->fs_info->tree_root) {
+ nolock = true;
+ trans = btrfs_join_transaction_nolock(root, 1);
+ } else {
+ trans = btrfs_join_transaction(root, 1);
+ }
BUG_ON(!trans);
cow_start = (u64)-1;
@@ -1211,8 +1208,13 @@ out_check:
BUG_ON(ret);
}
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
+ if (nolock) {
+ ret = btrfs_end_transaction_nolock(trans, root);
+ BUG_ON(ret);
+ } else {
+ ret = btrfs_end_transaction(trans, root);
+ BUG_ON(ret);
+ }
btrfs_free_path(path);
return 0;
}
@@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode,
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
+ int do_list = (root->root_key.objectid !=
+ BTRFS_ROOT_TREE_OBJECTID);
if (*bits & EXTENT_FIRST_DELALLOC)
*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += len;
root->fs_info->delalloc_bytes += len;
- if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+ if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
&root->fs_info->delalloc_inodes);
}
@@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode,
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
+ int do_list = (root->root_key.objectid !=
+ BTRFS_ROOT_TREE_OBJECTID);
if (*bits & EXTENT_FIRST_DELALLOC)
*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
if (*bits & EXTENT_DO_ACCOUNTING)
btrfs_delalloc_release_metadata(inode, len);
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && do_list)
btrfs_free_reserved_data_space(inode, len);
spin_lock(&root->fs_info->delalloc_lock);
root->fs_info->delalloc_bytes -= len;
BTRFS_I(inode)->delalloc_bytes -= len;
- if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+ if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
}
@@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
if (map_length < length + size)
return 1;
- return 0;
+ return ret;
}
/*
@@ -1426,7 +1433,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (root == root->fs_info->tree_root)
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
+ else
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
BUG_ON(ret);
if (!(rw & REQ_WRITE)) {
@@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
struct extent_state *cached_state = NULL;
int compressed = 0;
int ret;
+ bool nolock = false;
ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
end - start + 1);
@@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
return 0;
BUG_ON(!ordered_extent);
+ nolock = (root == root->fs_info->tree_root);
+
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list));
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
- trans = btrfs_join_transaction(root, 1);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root, 1);
+ else
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
@@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->file_offset + ordered_extent->len - 1,
0, &cached_state, GFP_NOFS);
- trans = btrfs_join_transaction(root, 1);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root, 1);
+ else
+ trans = btrfs_join_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->len);
BUG_ON(ret);
} else {
+ BUG_ON(root == root->fs_info->tree_root);
ret = insert_reserved_file_extent(trans, inode,
ordered_extent->file_offset,
ordered_extent->start,
@@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
- btrfs_delalloc_release_metadata(inode, ordered_extent->len);
- if (trans)
- btrfs_end_transaction(trans, root);
+ if (nolock) {
+ if (trans)
+ btrfs_end_transaction_nolock(trans, root);
+ } else {
+ btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+ if (trans)
+ btrfs_end_transaction(trans, root);
+ }
+
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
- struct btrfs_item *item;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
struct inode *inode;
@@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
/* pull out the item */
leaf = path->nodes[0];
- item = btrfs_item_nr(leaf, path->slots[0]);
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* make sure the item matches what we want */
@@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
- BUG_ON(ret);
+ if (ret == -ENOENT)
+ ret = 0;
err:
btrfs_free_path(path);
if (ret)
@@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root,
{
struct extent_buffer *eb;
int level;
- int ret;
u64 refs = 1;
+ int uninitialized_var(ret);
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
if (!path->nodes[level])
@@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root,
if (refs > 1)
return 1;
}
- return 0;
+ return ret; /* XXX callers? */
}
/*
@@ -3196,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
- if (root->ref_cows)
+ if (root->ref_cows || root == root->fs_info->tree_root)
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
path = btrfs_alloc_path();
@@ -3344,7 +3370,8 @@ delete:
} else {
break;
}
- if (found_extent && root->ref_cows) {
+ if (found_extent && (root->ref_cows ||
+ root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
@@ -3675,7 +3702,8 @@ void btrfs_evict_inode(struct inode *inode)
int ret;
truncate_inode_pages(&inode->i_data, 0);
- if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
+ if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
+ root == root->fs_info->tree_root))
goto no_delete;
if (is_bad_inode(inode)) {
@@ -3888,7 +3916,14 @@ static void inode_tree_del(struct inode *inode)
}
spin_unlock(&root->inode_lock);
- if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ /*
+ * Free space cache has inodes in the tree root, but the tree root has a
+ * root_refs of 0, so this could end up dropping the tree root as a
+ * snapshot, so we need the extra !root->fs_info->tree_root check to
+ * make sure we don't drop it.
+ */
+ if (empty && btrfs_root_refs(&root->root_item) == 0 &&
+ root != root->fs_info->tree_root) {
synchronize_srcu(&root->fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4282,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret = 0;
+ bool nolock = false;
if (BTRFS_I(inode)->dummy_inode)
return 0;
+ smp_mb();
+ nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+
if (wbc->sync_mode == WB_SYNC_ALL) {
- trans = btrfs_join_transaction(root, 1);
+ if (nolock)
+ trans = btrfs_join_transaction_nolock(root, 1);
+ else
+ trans = btrfs_join_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
- ret = btrfs_commit_transaction(trans, root);
+ if (nolock)
+ ret = btrfs_end_transaction_nolock(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
}
return ret;
}
@@ -5645,7 +5690,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_dio_private *dip;
struct bio_vec *bvec = bio->bi_io_vec;
- u64 start;
int skip_sum;
int write = rw & REQ_WRITE;
int ret = 0;
@@ -5671,7 +5715,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
dip->inode = inode;
dip->logical_offset = file_offset;
- start = dip->logical_offset;
dip->bytes = 0;
do {
dip->bytes += bvec->bv_len;
@@ -6308,6 +6351,21 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
+ if (root == root->fs_info->tree_root) {
+ struct btrfs_block_group_cache *block_group;
+
+ block_group = btrfs_lookup_block_group(root->fs_info,
+ BTRFS_I(inode)->block_group);
+ if (block_group && block_group->inode == inode) {
+ spin_lock(&block_group->lock);
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+ } else if (block_group) {
+ btrfs_put_block_group(block_group);
+ }
+ }
+
spin_lock(&root->orphan_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
@@ -6340,7 +6398,8 @@ int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (btrfs_root_refs(&root->root_item) == 0 &&
+ root != root->fs_info->tree_root)
return 1;
else
return generic_drop_inode(inode);
@@ -6609,7 +6668,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return 0;
}
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+ int sync)
{
struct btrfs_inode *binode;
struct inode *inode = NULL;
@@ -6631,7 +6691,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
spin_unlock(&root->fs_info->delalloc_lock);
if (inode) {
- write_inode_now(inode, 0);
+ if (sync) {
+ filemap_write_and_wait(inode->i_mapping);
+ /*
+ * We have to do this because compression doesn't
+ * actually set PG_writeback until it submits the pages
+ * for IO, which happens in an async thread, so we could
+ * race and not actually wait for any writeback pages
+ * because they've not been submitted yet. Technically
+ * this could still be the case for the ordered stuff
+ * since the async thread may not have started to do its
+ * work yet. If this becomes the case then we need to
+ * figure out a way to make sure that in writepage we
+ * wait for any async pages to be submitted before
+ * returning so that fdatawait does what its supposed to
+ * do.
+ */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ } else {
+ filemap_flush(inode->i_mapping);
+ }
if (delay_iput)
btrfs_add_delayed_iput(inode);
else
@@ -6757,27 +6836,33 @@ out_unlock:
return err;
}
-int btrfs_prealloc_file_range(struct inode *inode, int mode,
- u64 start, u64 num_bytes, u64 min_size,
- loff_t actual_len, u64 *alloc_hint)
+static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint,
+ struct btrfs_trans_handle *trans)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
int ret = 0;
+ bool own_trans = true;
+ if (trans)
+ own_trans = false;
while (num_bytes > 0) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
+ if (own_trans) {
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
}
ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
0, *alloc_hint, (u64)-1, &ins, 1);
if (ret) {
- btrfs_end_transaction(trans, root);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
break;
}
@@ -6810,11 +6895,30 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
+ if (own_trans)
+ btrfs_end_transaction(trans, root);
}
return ret;
}
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
+{
+ return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+ min_size, actual_len, alloc_hint,
+ NULL);
+}
+
+int btrfs_prealloc_file_range_trans(struct inode *inode,
+ struct btrfs_trans_handle *trans, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
+{
+ return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
+ min_size, actual_len, alloc_hint, trans);
+}
+
static long btrfs_fallocate(struct inode *inode, int mode,
loff_t offset, loff_t len)
{
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58db..463d91b4dd3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
static noinline int create_subvol(struct btrfs_root *root,
struct dentry *dentry,
- char *name, int namelen)
+ char *name, int namelen,
+ u64 *async_transid)
{
struct btrfs_trans_handle *trans;
struct btrfs_key key;
@@ -338,13 +339,19 @@ static noinline int create_subvol(struct btrfs_root *root,
d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
- err = btrfs_commit_transaction(trans, root);
+ if (async_transid) {
+ *async_transid = trans->transid;
+ err = btrfs_commit_transaction_async(trans, root, 1);
+ } else {
+ err = btrfs_commit_transaction(trans, root);
+ }
if (err && !ret)
ret = err;
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+ char *name, int namelen, u64 *async_transid)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -373,7 +380,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
- ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
+ if (async_transid) {
+ *async_transid = trans->transid;
+ ret = btrfs_commit_transaction_async(trans,
+ root->fs_info->extent_root, 1);
+ } else {
+ ret = btrfs_commit_transaction(trans,
+ root->fs_info->extent_root);
+ }
BUG_ON(ret);
ret = pending_snapshot->error;
@@ -395,6 +409,76 @@ fail:
return ret;
}
+/* copy of check_sticky in fs/namei.c()
+* It's inline, so penalty for filesystems that don't use sticky bit is
+* minimal.
+*/
+static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
+{
+ uid_t fsuid = current_fsuid();
+
+ if (!(dir->i_mode & S_ISVTX))
+ return 0;
+ if (inode->i_uid == fsuid)
+ return 0;
+ if (dir->i_uid == fsuid)
+ return 0;
+ return !capable(CAP_FOWNER);
+}
+
+/* copy of may_delete in fs/namei.c()
+ * Check whether we can remove a link victim from directory dir, check
+ * whether the type of victim is right.
+ * 1. We can't do it if dir is read-only (done in permission())
+ * 2. We should have write and exec permissions on dir
+ * 3. We can't remove anything from append-only dir
+ * 4. We can't do anything with immutable dir (done in permission())
+ * 5. If the sticky bit on dir is set we should either
+ * a. be owner of dir, or
+ * b. be owner of victim, or
+ * c. have CAP_FOWNER capability
+ * 6. If the victim is append-only or immutable we can't do antyhing with
+ * links pointing to it.
+ * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 9. We can't remove a root or mountpoint.
+ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * nfs_async_unlink().
+ */
+
+static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+{
+ int error;
+
+ if (!victim->d_inode)
+ return -ENOENT;
+
+ BUG_ON(victim->d_parent->d_inode != dir);
+ audit_inode_child(victim, dir);
+
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ return error;
+ if (IS_APPEND(dir))
+ return -EPERM;
+ if (btrfs_check_sticky(dir, victim->d_inode)||
+ IS_APPEND(victim->d_inode)||
+ IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+ return -EPERM;
+ if (isdir) {
+ if (!S_ISDIR(victim->d_inode->i_mode))
+ return -ENOTDIR;
+ if (IS_ROOT(victim))
+ return -EBUSY;
+ } else if (S_ISDIR(victim->d_inode->i_mode))
+ return -EISDIR;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ if (victim->d_flags & DCACHE_NFSFS_RENAMED)
+ return -EBUSY;
+ return 0;
+}
+
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
{
@@ -412,7 +496,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
*/
static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
- struct btrfs_root *snap_src)
+ struct btrfs_root *snap_src,
+ u64 *async_transid)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
@@ -443,10 +528,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry);
+ error = create_snapshot(snap_src, dentry,
+ name, namelen, async_transid);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
- name, namelen);
+ name, namelen, async_transid);
}
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -708,7 +794,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
char *sizestr;
char *devstr = NULL;
int ret = 0;
- int namelen;
int mod = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +807,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- namelen = strlen(vol_args->name);
mutex_lock(&root->fs_info->volume_mutex);
sizestr = vol_args->name;
@@ -801,11 +885,13 @@ out_unlock:
return ret;
}
-static noinline int btrfs_ioctl_snap_create(struct file *file,
- void __user *arg, int subvol)
+static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+ char *name,
+ unsigned long fd,
+ int subvol,
+ u64 *transid)
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
- struct btrfs_ioctl_vol_args *vol_args;
struct file *src_file;
int namelen;
int ret = 0;
@@ -813,23 +899,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
- vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
-
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- namelen = strlen(vol_args->name);
- if (strchr(vol_args->name, '/')) {
+ namelen = strlen(name);
+ if (strchr(name, '/')) {
ret = -EINVAL;
goto out;
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
- NULL);
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ NULL, transid);
} else {
struct inode *src_inode;
- src_file = fget(vol_args->fd);
+ src_file = fget(fd);
if (!src_file) {
ret = -EINVAL;
goto out;
@@ -843,12 +924,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
fput(src_file);
goto out;
}
- ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
- BTRFS_I(src_inode)->root);
+ ret = btrfs_mksubvol(&file->f_path, name, namelen,
+ BTRFS_I(src_inode)->root,
+ transid);
fput(src_file);
}
out:
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+ void __user *arg, int subvol,
+ int async)
+{
+ struct btrfs_ioctl_vol_args *vol_args = NULL;
+ struct btrfs_ioctl_async_vol_args *async_vol_args = NULL;
+ char *name;
+ u64 fd;
+ u64 transid = 0;
+ int ret;
+
+ if (async) {
+ async_vol_args = memdup_user(arg, sizeof(*async_vol_args));
+ if (IS_ERR(async_vol_args))
+ return PTR_ERR(async_vol_args);
+
+ name = async_vol_args->name;
+ fd = async_vol_args->fd;
+ async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0';
+ } else {
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ name = vol_args->name;
+ fd = vol_args->fd;
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ }
+
+ ret = btrfs_ioctl_snap_create_transid(file, name, fd,
+ subvol, &transid);
+
+ if (!ret && async) {
+ if (copy_to_user(arg +
+ offsetof(struct btrfs_ioctl_async_vol_args,
+ transid), &transid, sizeof(transid)))
+ return -EFAULT;
+ }
+
kfree(vol_args);
+ kfree(async_vol_args);
+
return ret;
}
@@ -1073,14 +1198,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = kmalloc(sizeof(*args), GFP_KERNEL);
- if (!args)
- return -ENOMEM;
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
- if (copy_from_user(args, argp, sizeof(*args))) {
- kfree(args);
- return -EFAULT;
- }
inode = fdentry(file)->d_inode;
ret = search_ioctl(inode, args);
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
@@ -1188,14 +1309,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = kmalloc(sizeof(*args), GFP_KERNEL);
- if (!args)
- return -ENOMEM;
+ args = memdup_user(argp, sizeof(*args));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
- if (copy_from_user(args, argp, sizeof(*args))) {
- kfree(args);
- return -EFAULT;
- }
inode = fdentry(file)->d_inode;
if (args->treeid == 0)
@@ -1227,9 +1344,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
int ret;
int err = 0;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -1259,13 +1373,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
inode = dentry->d_inode;
+ dest = BTRFS_I(inode)->root;
+ if (!capable(CAP_SYS_ADMIN)){
+ /*
+ * Regular user. Only allow this with a special mount
+ * option, when the user has write+exec access to the
+ * subvol root, and when rmdir(2) would have been
+ * allowed.
+ *
+ * Note that this is _not_ check that the subvol is
+ * empty or doesn't contain data that we wouldn't
+ * otherwise be able to delete.
+ *
+ * Users who want to delete empty subvols should try
+ * rmdir(2).
+ */
+ err = -EPERM;
+ if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ goto out_dput;
+
+ /*
+ * Do not allow deletion if the parent dir is the same
+ * as the dir to be deleted. That means the ioctl
+ * must be called on the dentry referencing the root
+ * of the subvol, not a random directory contained
+ * within it.
+ */
+ err = -EINVAL;
+ if (root == dest)
+ goto out_dput;
+
+ err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+ if (err)
+ goto out_dput;
+
+ /* check if subvolume may be deleted by a non-root user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+ }
+
if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
}
- dest = BTRFS_I(inode)->root;
-
mutex_lock(&inode->i_mutex);
err = d_invalidate(dentry);
if (err)
@@ -1304,7 +1456,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
BUG_ON(ret);
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_end_transaction(trans, root);
BUG_ON(ret);
inode->i_flags |= S_DEAD;
out_up_write:
@@ -1502,11 +1654,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
path->reada = 2;
if (inode < src) {
- mutex_lock(&inode->i_mutex);
- mutex_lock(&src->i_mutex);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
} else {
- mutex_lock(&src->i_mutex);
- mutex_lock(&inode->i_mutex);
+ mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
}
/* determine range to clone */
@@ -1530,13 +1682,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
- if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+ ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+ if (!ordered &&
+ !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
+ EXTENT_DELALLOC, 0, NULL))
break;
unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
if (ordered)
btrfs_put_ordered_extent(ordered);
- btrfs_wait_ordered_range(src, off, off+len);
+ btrfs_wait_ordered_range(src, off, len);
}
/* clone data */
@@ -1605,7 +1759,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
btrfs_release_path(root, path);
- if (key.offset + datal < off ||
+ if (key.offset + datal <= off ||
key.offset >= off+len)
goto next;
@@ -1879,6 +2033,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
return 0;
}
+static void get_block_group_info(struct list_head *groups_list,
+ struct btrfs_ioctl_space_info *space)
+{
+ struct btrfs_block_group_cache *block_group;
+
+ space->total_bytes = 0;
+ space->used_bytes = 0;
+ space->flags = 0;
+ list_for_each_entry(block_group, groups_list, list) {
+ space->flags = block_group->flags;
+ space->total_bytes += block_group->key.offset;
+ space->used_bytes +=
+ btrfs_block_group_used(&block_group->item);
+ }
+}
+
long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_space_args space_args;
@@ -1887,27 +2057,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
struct btrfs_ioctl_space_info *dest_orig;
struct btrfs_ioctl_space_info *user_dest;
struct btrfs_space_info *info;
+ u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+ int num_types = 4;
int alloc_size;
int ret = 0;
int slot_count = 0;
+ int i, c;
if (copy_from_user(&space_args,
(struct btrfs_ioctl_space_args __user *)arg,
sizeof(space_args)))
return -EFAULT;
- /* first we count slots */
- rcu_read_lock();
- list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
- slot_count++;
- rcu_read_unlock();
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ info = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+ list) {
+ if (tmp->flags == types[i]) {
+ info = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!info)
+ continue;
+
+ down_read(&info->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&info->block_groups[c]))
+ slot_count++;
+ }
+ up_read(&info->groups_sem);
+ }
/* space_slots == 0 means they are asking for a count */
if (space_args.space_slots == 0) {
space_args.total_spaces = slot_count;
goto out;
}
+
+ slot_count = min_t(int, space_args.space_slots, slot_count);
+
alloc_size = sizeof(*dest) * slot_count;
+
/* we generally have at most 6 or so space infos, one for each raid
* level. So, a whole page should be more than enough for everyone
*/
@@ -1921,27 +2120,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
dest_orig = dest;
/* now we have a buffer to copy into */
- rcu_read_lock();
- list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
- /* make sure we don't copy more than we allocated
- * in our buffer
- */
- if (slot_count == 0)
- break;
- slot_count--;
-
- /* make sure userland has enough room in their buffer */
- if (space_args.total_spaces >= space_args.space_slots)
- break;
+ for (i = 0; i < num_types; i++) {
+ struct btrfs_space_info *tmp;
+
+ info = NULL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+ list) {
+ if (tmp->flags == types[i]) {
+ info = tmp;
+ break;
+ }
+ }
+ rcu_read_unlock();
- space.flags = info->flags;
- space.total_bytes = info->total_bytes;
- space.used_bytes = info->bytes_used;
- memcpy(dest, &space, sizeof(space));
- dest++;
- space_args.total_spaces++;
+ if (!info)
+ continue;
+ down_read(&info->groups_sem);
+ for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+ if (!list_empty(&info->block_groups[c])) {
+ get_block_group_info(&info->block_groups[c],
+ &space);
+ memcpy(dest, &space, sizeof(space));
+ dest++;
+ space_args.total_spaces++;
+ }
+ }
+ up_read(&info->groups_sem);
}
- rcu_read_unlock();
user_dest = (struct btrfs_ioctl_space_info *)
(arg + sizeof(struct btrfs_ioctl_space_args));
@@ -1984,6 +2190,36 @@ long btrfs_ioctl_trans_end(struct file *file)
return 0;
}
+static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+{
+ struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 transid;
+
+ trans = btrfs_start_transaction(root, 0);
+ transid = trans->transid;
+ btrfs_commit_transaction_async(trans, root, 0);
+
+ if (argp)
+ if (copy_to_user(argp, &transid, sizeof(transid)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+{
+ struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
+ u64 transid;
+
+ if (argp) {
+ if (copy_from_user(&transid, argp, sizeof(transid)))
+ return -EFAULT;
+ } else {
+ transid = 0; /* current trans */
+ }
+ return btrfs_wait_for_commit(root, transid);
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -1998,9 +2234,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case BTRFS_IOC_SNAP_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 0);
+ return btrfs_ioctl_snap_create(file, argp, 0, 0);
+ case BTRFS_IOC_SNAP_CREATE_ASYNC:
+ return btrfs_ioctl_snap_create(file, argp, 0, 1);
case BTRFS_IOC_SUBVOL_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 1);
+ return btrfs_ioctl_snap_create(file, argp, 1, 0);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -2034,6 +2272,10 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC:
btrfs_sync_fs(file->f_dentry->d_sb, 1);
return 0;
+ case BTRFS_IOC_START_SYNC:
+ return btrfs_ioctl_start_sync(file, argp);
+ case BTRFS_IOC_WAIT_SYNC:
+ return btrfs_ioctl_wait_sync(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517..17c99ebdf96 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,21 @@
#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 4087
/* this should be 4k */
+#define BTRFS_PATH_NAME_MAX 4087
struct btrfs_ioctl_vol_args {
__s64 fd;
char name[BTRFS_PATH_NAME_MAX + 1];
};
+#define BTRFS_SNAPSHOT_NAME_MAX 4079
+struct btrfs_ioctl_async_vol_args {
+ __s64 fd;
+ __u64 transid;
+ char name[BTRFS_SNAPSHOT_NAME_MAX + 1];
+};
+
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;
@@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args {
#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \
+ struct btrfs_ioctl_async_vol_args)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5ad..f4621f6deca 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -526,7 +526,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
u64 end;
u64 orig_end;
- u64 wait_end;
struct btrfs_ordered_extent *ordered;
int found;
@@ -537,7 +536,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (orig_end > INT_LIMIT(loff_t))
orig_end = INT_LIMIT(loff_t);
}
- wait_end = orig_end;
again:
/* start IO across the range first to instantiate any delalloc
* extents
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4..045c9c2b2d7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,7 @@
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
+#include "free-space-cache.h"
/*
* backref_node, mapping_node and tree_block start with this
@@ -178,8 +179,6 @@ struct reloc_control {
u64 search_start;
u64 extents_found;
- int block_rsv_retries;
-
unsigned int stage:8;
unsigned int create_reloc_tree:1;
unsigned int merge_reloc_tree:1;
@@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
LIST_HEAD(reloc_roots);
u64 num_bytes = 0;
int ret;
- int retries = 0;
mutex_lock(&root->fs_info->trans_mutex);
rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2143,7 +2141,7 @@ again:
if (!err) {
num_bytes = rc->merging_rsv_size;
ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
- num_bytes, &retries);
+ num_bytes);
if (ret)
err = ret;
}
@@ -2155,7 +2153,6 @@ again:
btrfs_end_transaction(trans, rc->extent_root);
btrfs_block_rsv_release(rc->extent_root,
rc->block_rsv, num_bytes);
- retries = 0;
goto again;
}
}
@@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
- &rc->block_rsv_retries);
+ ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
if (ret) {
if (ret == -EAGAIN)
rc->commit_transaction = 1;
return ret;
}
- rc->block_rsv_retries = 0;
return 0;
}
@@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
ret = get_ref_objectid_v0(rc, path, extent_key,
&ref_owner, NULL);
+ if (ret < 0)
+ return ret;
BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
level = (int)ref_owner;
/* FIXME: get real generation */
@@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc,
return ret;
}
+static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
+ struct inode *inode, u64 ino)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_trans_handle *trans;
+ unsigned long nr;
+ int ret = 0;
+
+ if (inode)
+ goto truncate;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+ if (inode && !IS_ERR(inode))
+ iput(inode);
+ return -ENOENT;
+ }
+
+truncate:
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ trans = btrfs_join_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ goto out;
+ }
+
+ ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+
+ btrfs_free_path(path);
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+out:
+ iput(inode);
+ return ret;
+}
+
/*
* helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
* this function scans fs tree to find blocks reference the data extent
@@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc,
int counted;
int ret;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
ref_root = btrfs_extent_data_ref_root(leaf, ref);
ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
ref_count = btrfs_extent_data_ref_count(leaf, ref);
+ /*
+ * This is an extent belonging to the free space cache, lets just delete
+ * it and redo the search.
+ */
+ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
+ ret = delete_block_group_cache(rc->extent_root->fs_info,
+ NULL, ref_objectid);
+ if (ret != -ENOENT)
+ return ret;
+ ret = 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc)
* is no reservation in transaction handle.
*/
ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
- rc->extent_root->nodesize * 256,
- &rc->block_rsv_retries);
+ rc->extent_root->nodesize * 256);
if (ret)
return ret;
@@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
- rc->block_rsv_retries = 0;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
@@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
{
struct btrfs_fs_info *fs_info = extent_root->fs_info;
struct reloc_control *rc;
+ struct inode *inode;
+ struct btrfs_path *path;
int ret;
int rw = 0;
int err = 0;
@@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
rw = 1;
}
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
+ path);
+ btrfs_free_path(path);
+
+ if (!IS_ERR(inode))
+ ret = delete_block_group_cache(fs_info, inode, 0);
+ else
+ ret = PTR_ERR(inode);
+
+ if (ret && ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+
rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
if (IS_ERR(rc->data_inode)) {
err = PTR_ERR(rc->data_inode);
@@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
btrfs_add_ordered_sum(inode, ordered, sums);
}
btrfs_put_ordered_extent(ordered);
- return 0;
+ return ret;
}
void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c..6a1086e83ff 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
{
struct btrfs_root *dead_root;
- struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
nritems = btrfs_header_nritems(leaf);
slot = path->slots[0];
}
- item = btrfs_item_nr(leaf, slot);
btrfs_item_key_to_cpu(leaf, &key, slot);
if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
goto next;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ebe46c62874..8299a25ffc8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb)
ret = close_ctree(root);
sb->s_fs_info = NULL;
+
+ (void)ret; /* FIXME: need to fix VFS to return error? */
}
enum {
@@ -68,7 +70,8 @@ enum {
Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
- Opt_discard, Opt_err,
+ Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
+ Opt_user_subvol_rm_allowed,
};
static match_table_t tokens = {
@@ -92,6 +95,9 @@ static match_table_t tokens = {
{Opt_flushoncommit, "flushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
{Opt_discard, "discard"},
+ {Opt_space_cache, "space_cache"},
+ {Opt_clear_cache, "clear_cache"},
+ {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_err, NULL},
};
@@ -235,6 +241,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_discard:
btrfs_set_opt(info->mount_opt, DISCARD);
break;
+ case Opt_space_cache:
+ printk(KERN_INFO "btrfs: enabling disk space caching\n");
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ case Opt_clear_cache:
+ printk(KERN_INFO "btrfs: force clearing of disk cache\n");
+ btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -380,7 +396,7 @@ static struct dentry *get_default_root(struct super_block *sb,
find_root:
new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
if (IS_ERR(new_root))
- return ERR_PTR(PTR_ERR(new_root));
+ return ERR_CAST(new_root);
if (btrfs_root_refs(&new_root->root_item) == 0)
return ERR_PTR(-ENOENT);
@@ -436,7 +452,6 @@ static int btrfs_fill_super(struct super_block *sb,
{
struct inode *inode;
struct dentry *root_dentry;
- struct btrfs_super_block *disk_super;
struct btrfs_root *tree_root;
struct btrfs_key key;
int err;
@@ -458,7 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
return PTR_ERR(tree_root);
}
sb->s_fs_info = tree_root;
- disk_super = &tree_root->fs_info->super_copy;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -571,7 +585,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
char *subvol_name = NULL;
u64 subvol_objectid = 0;
int error = 0;
- int found = 0;
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
@@ -607,7 +620,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
goto error_close_devices;
}
- found = 1;
btrfs_close_devices(fs_devices);
} else {
char b[BDEVNAME_SIZE];
@@ -629,7 +641,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (IS_ERR(root)) {
error = PTR_ERR(root);
deactivate_locked_super(s);
- goto error;
+ goto error_free_subvol_name;
}
/* if they gave us a subvolume name bind mount into that */
if (strcmp(subvol_name, ".")) {
@@ -643,14 +655,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
deactivate_locked_super(s);
error = PTR_ERR(new_root);
dput(root);
- goto error_close_devices;
+ goto error_free_subvol_name;
}
if (!new_root->d_inode) {
dput(root);
dput(new_root);
deactivate_locked_super(s);
error = -ENXIO;
- goto error_close_devices;
+ goto error_free_subvol_name;
}
dput(root);
root = new_root;
@@ -665,7 +677,6 @@ error_close_devices:
btrfs_close_devices(fs_devices);
error_free_subvol_name:
kfree(subvol_name);
-error:
return ERR_PTR(error);
}
@@ -713,18 +724,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
+ u64 total_used_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry_rcu(found, head, list) {
+ if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_SYSTEM))
+ total_used_data += found->disk_total;
+ else
+ total_used_data += found->disk_used;
total_used += found->disk_used;
+ }
rcu_read_unlock();
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_bfree;
+ buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63..1fffbc017bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,6 +163,7 @@ enum btrfs_trans_type {
TRANS_START,
TRANS_JOIN,
TRANS_USERSPACE,
+ TRANS_JOIN_NOLOCK,
};
static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
- int retries = 0;
int ret;
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h)
return ERR_PTR(-ENOMEM);
- mutex_lock(&root->fs_info->trans_mutex);
+ if (type != TRANS_JOIN_NOLOCK)
+ mutex_lock(&root->fs_info->trans_mutex);
if (may_wait_transaction(root, type))
wait_current_trans(root);
@@ -195,7 +196,8 @@ again:
cur_trans = root->fs_info->running_transaction;
cur_trans->use_count++;
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (type != TRANS_JOIN_NOLOCK)
+ mutex_unlock(&root->fs_info->trans_mutex);
h->transid = cur_trans->transid;
h->transaction = cur_trans;
@@ -212,8 +214,7 @@ again:
}
if (num_items > 0) {
- ret = btrfs_trans_reserve_metadata(h, root, num_items,
- &retries);
+ ret = btrfs_trans_reserve_metadata(h, root, num_items);
if (ret == -EAGAIN) {
btrfs_commit_transaction(h, root);
goto again;
@@ -224,9 +225,11 @@ again:
}
}
- mutex_lock(&root->fs_info->trans_mutex);
+ if (type != TRANS_JOIN_NOLOCK)
+ mutex_lock(&root->fs_info->trans_mutex);
record_root_in_trans(h, root);
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (type != TRANS_JOIN_NOLOCK)
+ mutex_unlock(&root->fs_info->trans_mutex);
if (!current->journal_info && type != TRANS_USERSPACE)
current->journal_info = h;
@@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
return start_transaction(root, 0, TRANS_JOIN);
}
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+ int num_blocks)
+{
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+}
+
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks)
{
@@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
return 0;
}
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+{
+ struct btrfs_transaction *cur_trans = NULL, *t;
+ int ret;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ ret = 0;
+ if (transid) {
+ if (transid <= root->fs_info->last_trans_committed)
+ goto out_unlock;
+
+ /* find specified transaction */
+ list_for_each_entry(t, &root->fs_info->trans_list, list) {
+ if (t->transid == transid) {
+ cur_trans = t;
+ break;
+ }
+ if (t->transid > transid)
+ break;
+ }
+ ret = -EINVAL;
+ if (!cur_trans)
+ goto out_unlock; /* bad transid */
+ } else {
+ /* find newest transaction that is committing | committed */
+ list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+ list) {
+ if (t->in_commit) {
+ if (t->commit_done)
+ goto out_unlock;
+ cur_trans = t;
+ break;
+ }
+ }
+ if (!cur_trans)
+ goto out_unlock; /* nothing committing|committed */
+ }
+
+ cur_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ wait_for_commit(root, cur_trans);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ put_transaction(cur_trans);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return ret;
+}
+
#if 0
/*
* rate limit against the drop_snapshot code. This helps to slow down new
@@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int throttle)
+ struct btrfs_root *root, int throttle, int lock)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
@@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
- if (!root->fs_info->open_ioctl_trans &&
+ if (lock && !root->fs_info->open_ioctl_trans &&
should_end_transaction(trans, root))
trans->transaction->blocked = 1;
- if (cur_trans->blocked && !cur_trans->in_commit) {
+ if (lock && cur_trans->blocked && !cur_trans->in_commit) {
if (throttle)
return btrfs_commit_transaction(trans, root);
else
wake_up_process(info->transaction_kthread);
}
- mutex_lock(&info->trans_mutex);
+ if (lock)
+ mutex_lock(&info->trans_mutex);
WARN_ON(cur_trans != info->running_transaction);
WARN_ON(cur_trans->num_writers < 1);
cur_trans->num_writers--;
+ smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
- mutex_unlock(&info->trans_mutex);
+ if (lock)
+ mutex_unlock(&info->trans_mutex);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 0);
+ return __btrfs_end_transaction(trans, root, 0, 1);
}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 1);
+ return __btrfs_end_transaction(trans, root, 1, 1);
+}
+
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 0, 0);
}
/*
@@ -836,7 +906,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct extent_buffer *tmp;
struct extent_buffer *old;
int ret;
- int retries = 0;
u64 to_reserve = 0;
u64 index = 0;
u64 objectid;
@@ -858,7 +927,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (to_reserve > 0) {
ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
- to_reserve, &retries);
+ to_reserve);
if (ret) {
pending->error = ret;
goto fail;
@@ -966,6 +1035,8 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
+ if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+ super->cache_generation = root_item->generation;
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -988,11 +1059,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
return ret;
}
+/*
+ * wait for the current transaction commit to start and block subsequent
+ * transaction joins
+ */
+static void wait_current_trans_commit_start(struct btrfs_root *root,
+ struct btrfs_transaction *trans)
+{
+ DEFINE_WAIT(wait);
+
+ if (trans->in_commit)
+ return;
+
+ while (1) {
+ prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (trans->in_commit) {
+ finish_wait(&root->fs_info->transaction_blocked_wait,
+ &wait);
+ break;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+ schedule();
+ mutex_lock(&root->fs_info->trans_mutex);
+ finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
+ }
+}
+
+/*
+ * wait for the current transaction to start and then become unblocked.
+ * caller holds ref.
+ */
+static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
+ struct btrfs_transaction *trans)
+{
+ DEFINE_WAIT(wait);
+
+ if (trans->commit_done || (trans->in_commit && !trans->blocked))
+ return;
+
+ while (1) {
+ prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (trans->commit_done ||
+ (trans->in_commit && !trans->blocked)) {
+ finish_wait(&root->fs_info->transaction_wait,
+ &wait);
+ break;
+ }
+ mutex_unlock(&root->fs_info->trans_mutex);
+ schedule();
+ mutex_lock(&root->fs_info->trans_mutex);
+ finish_wait(&root->fs_info->transaction_wait,
+ &wait);
+ }
+}
+
+/*
+ * commit transactions asynchronously. once btrfs_commit_transaction_async
+ * returns, any subsequent transaction will not be allowed to join.
+ */
+struct btrfs_async_commit {
+ struct btrfs_trans_handle *newtrans;
+ struct btrfs_root *root;
+ struct delayed_work work;
+};
+
+static void do_async_commit(struct work_struct *work)
+{
+ struct btrfs_async_commit *ac =
+ container_of(work, struct btrfs_async_commit, work.work);
+
+ btrfs_commit_transaction(ac->newtrans, ac->root);
+ kfree(ac);
+}
+
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int wait_for_unblock)
+{
+ struct btrfs_async_commit *ac;
+ struct btrfs_transaction *cur_trans;
+
+ ac = kmalloc(sizeof(*ac), GFP_NOFS);
+ BUG_ON(!ac);
+
+ INIT_DELAYED_WORK(&ac->work, do_async_commit);
+ ac->root = root;
+ ac->newtrans = btrfs_join_transaction(root, 0);
+
+ /* take transaction reference */
+ mutex_lock(&root->fs_info->trans_mutex);
+ cur_trans = trans->transaction;
+ cur_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ btrfs_end_transaction(trans, root);
+ schedule_delayed_work(&ac->work, 0);
+
+ /* wait for transaction to start and unblock */
+ mutex_lock(&root->fs_info->trans_mutex);
+ if (wait_for_unblock)
+ wait_current_trans_commit_start_and_unblock(root, cur_trans);
+ else
+ wait_current_trans_commit_start(root, cur_trans);
+ put_transaction(cur_trans);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+}
+
+/*
+ * btrfs_transaction state sequence:
+ * in_commit = 0, blocked = 0 (initial)
+ * in_commit = 1, blocked = 1
+ * blocked = 0
+ * commit_done = 1
+ */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
unsigned long joined = 0;
- unsigned long timeout = 1;
struct btrfs_transaction *cur_trans;
struct btrfs_transaction *prev_trans = NULL;
DEFINE_WAIT(wait);
@@ -1039,6 +1226,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
@@ -1063,11 +1252,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- if (cur_trans->num_writers > 1)
- timeout = MAX_SCHEDULE_TIMEOUT;
- else if (should_grow)
- timeout = 1;
-
mutex_unlock(&root->fs_info->trans_mutex);
if (flush_on_commit || snap_pending) {
@@ -1089,8 +1273,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
TASK_UNINTERRUPTIBLE);
smp_mb();
- if (cur_trans->num_writers > 1 || should_grow)
- schedule_timeout(timeout);
+ if (cur_trans->num_writers > 1)
+ schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+ else if (should_grow)
+ schedule_timeout(1);
mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bf..f104b57ad4e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+ int num_blocks);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks);
+int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
int btrfs_clean_old_snapshots(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int wait_for_unblock);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed..992ab425599 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int ret = 0;
int wret;
int level;
- int orig_level;
int is_extent = 0;
int next_key_ret = 0;
u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
return -ENOMEM;
level = btrfs_header_level(root->node);
- orig_level = level;
if (level == 0)
goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9..a29f19384a2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
{
struct inode *dir;
int ret;
- struct btrfs_key location;
struct btrfs_inode_ref *ref;
struct btrfs_dir_item *di;
struct inode *inode;
@@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
unsigned long ref_ptr;
unsigned long ref_end;
- location.objectid = key->objectid;
- location.type = BTRFS_INODE_ITEM_KEY;
- location.offset = 0;
-
/*
* it is possible that we didn't log all the parent directories
* for a given inode. If we don't find the dir, just don't
@@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
- u32 item_size;
int level;
int i;
int ret;
@@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
- item_size = btrfs_item_size_nr(eb, i);
/* inode keys are done during the first stage */
if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
u64 root_owner;
- u64 root_gen;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
@@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
@@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
u64 root_owner;
- u64 root_gen;
int i;
int slot;
int ret;
@@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
slot = path->slots[i];
if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
- struct extent_buffer *node;
- node = path->nodes[i];
path->slots[i]++;
*level = i;
WARN_ON(*level == 0);
@@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level + 1];
root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]));
if (wc->free) {
@@ -2273,7 +2260,7 @@ fail:
}
btrfs_end_log_trans(root);
- return 0;
+ return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
@@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
- u32 size;
int err = 0;
int ret;
int nritems;
@@ -2793,7 +2779,6 @@ again:
break;
src = path->nodes[0];
- size = btrfs_item_size_nr(src, path->slots[0]);
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
goto next_slot;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e25e46a8b4e..cc04dc1445d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1898,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
u64 size_to_free;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_chunk *chunk;
struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_key found_key;
@@ -1962,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (found_key.objectid != key.objectid)
break;
- chunk = btrfs_item_ptr(path->nodes[0],
- path->slots[0],
- struct btrfs_chunk);
/* chunk zero is special */
if (found_key.offset == 0)
break;
@@ -3031,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
}
bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
dev = multi->stripes[dev_nr].dev;
- BUG_ON(rw == WRITE && !dev->writeable);
- if (dev && dev->bdev) {
+ if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
bio->bi_bdev = dev->bdev;
if (async_submit)
schedule_bio(root, dev, rw, bio);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb21587..698fdd2c739 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
- struct btrfs_item *item;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
int ret = 0, slot, advance;
@@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
advance = 1;
- item = btrfs_item_nr(leaf, slot);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
/* check to make sure this item is what we want */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa23..b9cd5445f71 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
- int out_written = 0;
- int in_read = 0;
unsigned long bytes_left;
*out_pages = 0;
@@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
- out_written = 0;
- in_read = 0;
-
while (workspace->def_strm.total_in < len) {
ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f027382b54b..3d06ccc953a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1081,30 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb)
}
/**
- * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
+ * @nr: the number of pages to write
*
* Start writeback on some inodes on this super_block. No guarantees are made
* on how many (if any) will be written, and this function does not wait
- * for IO completion of submitted IO. The number of pages submitted is
- * returned.
+ * for IO completion of submitted IO.
*/
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
{
DECLARE_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_NONE,
.done = &done,
+ .nr_pages = nr,
};
WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
- work.nr_pages = get_nr_dirty_pages();
-
bdi_queue_work(sb->s_bdi, &work);
wait_for_completion(&done);
}
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
+
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+ return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+}
EXPORT_SYMBOL(writeback_inodes_sb);
/**
@@ -1127,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
/**
+ * writeback_inodes_sb_if_idle - start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
+ unsigned long nr)
+{
+ if (!writeback_in_progress(sb->s_bdi)) {
+ down_read(&sb->s_umount);
+ writeback_inodes_sb_nr(sb, nr);
+ up_read(&sb->s_umount);
+ return 1;
+ } else
+ return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+
+/**
* sync_inodes_sb - sync sb inode pages
* @sb: the superblock
*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 09eec350054..0ead399e08b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -58,7 +58,9 @@ struct writeback_control {
struct bdi_writeback;
int inode_wait(void *);
void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
int writeback_inodes_sb_if_idle(struct super_block *);
+int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
void sync_inodes_sb(struct super_block *);
void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc);