aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/extents_status.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-02-26 14:52:45 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-26 14:52:45 -0800
commit6515925b8259549b7f2187e25d3260306e3e85e5 (patch)
tree7d51487f308f8f0ac95d3113606c39ba592111ba /fs/ext4/extents_status.c
parentbbbd27e694ce2c5fde9c8fcedbea618dd9153fe7 (diff)
parent304e220f0879198b1f5309ad6f0be862b4009491 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Theodore Ts'o: "The one new feature added in this patch series is the ability to use the "punch hole" functionality for inodes that are not using extent maps. In the bug fix category, we fixed some races in the AIO and fstrim code, and some potential NULL pointer dereferences and memory leaks in error handling code paths. In the optimization category, we fixed a performance regression in the jbd2 layer introduced by commit d9b01934d56a ("jbd: fix fsync() tid wraparound bug", introduced in v3.0) which shows up in the AIM7 benchmark. We also further optimized jbd2 by minimize the amount of time that transaction handles are held active. This patch series also features some additional enhancement of the extent status tree, which is now used to cache extent information in a more efficient/compact form than what we use on-disk." * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (65 commits) ext4: fix free clusters calculation in bigalloc filesystem ext4: no need to remove extent if len is 0 in ext4_es_remove_extent() ext4: fix xattr block allocation/release with bigalloc ext4: reclaim extents from extent status tree ext4: adjust some functions for reclaiming extents from extent status tree ext4: remove single extent cache ext4: lookup block mapping in extent status tree ext4: track all extent status in extent status tree ext4: let ext4_ext_map_blocks return EXT4_MAP_UNWRITTEN flag ext4: rename and improbe ext4_es_find_extent() ext4: add physical block and status member into extent status tree ext4: refine extent status tree ext4: use ERR_PTR() abstraction for ext4_append() ext4: refactor code to read directory blocks into ext4_read_dirblock() ext4: add debugging context for warning in ext4_da_update_reserve_space() ext4: use KERN_WARNING for warning messages jbd2: use module parameters instead of debugfs for jbd_debug ext4: use module parameters instead of debugfs for mballoc_debug ext4: start handle at the last possible moment when creating inodes ext4: fix the number of credits needed for acl ops with inline data ...
Diffstat (limited to 'fs/ext4/extents_status.c')
-rw-r--r--fs/ext4/extents_status.c631
1 files changed, 467 insertions, 164 deletions
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fc..f768f4a98a2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -23,40 +23,53 @@
* (e.g. Reservation space warning), and provide extent-level locking.
* Delay extent tree is the first step to achieve this goal. It is
* original built by Yongqiang Yang. At that time it is called delay
- * extent tree, whose goal is only track delay extent in memory to
+ * extent tree, whose goal is only track delayed extents in memory to
* simplify the implementation of fiemap and bigalloc, and introduce
* lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
- * delay extent tree at the following comment. But for better
- * understand what it does, it has been rename to extent status tree.
+ * delay extent tree at the first commit. But for better understand
+ * what it does, it has been rename to extent status tree.
*
- * Currently the first step has been done. All delay extents are
- * tracked in the tree. It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
+ * Step1:
+ * Currently the first step has been done. All delayed extents are
+ * tracked in the tree. It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
* invalidated. Therefore the implementation of fiemap and bigalloc
* are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
*
* The following comment describes the implemenmtation of extent
* status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree. Hence, single extent cache can be removed
+ * because extent status tree can do a better job. Extents in status
+ * tree are loaded on-demand. Therefore, the extent status tree may not
+ * contain all of the extents in a file. Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory. written/unwritten/-
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure. Delayed extents will not be
+ * reclimed because fiemap, bigalloc, and seek_data/hole need it.
*/
/*
- * extents status tree implementation for ext4.
+ * Extent status tree implementation for ext4.
*
*
* ==========================================================================
- * Extents status encompass delayed extents and extent locks
+ * Extent status tree tracks all extent status.
*
- * 1. Why delayed extent implementation ?
+ * 1. Why we need to implement extent status tree?
*
- * Without delayed extent, ext4 identifies a delayed extent by looking
+ * Without extent status tree, ext4 identifies a delayed extent by looking
* up page cache, this has several deficiencies - complicated, buggy,
* and inefficient code.
*
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks are belonged to a delayed extent.
*
- * Let us have a look at how they do without delayed extents implementation.
+ * Let us have a look at how they do without extent status tree.
* -- FIEMAP
* FIEMAP looks up page cache to identify delayed allocations from holes.
*
@@ -68,47 +81,48 @@
* already under delayed allocation or not to determine whether
* quota reserving is needed for the cluster.
*
- * -- punch hole
- * punch hole looks up page cache to identify a delayed extent.
- *
* -- writeout
* Writeout looks up whole page cache to see if a buffer is
* mapped, If there are not very many delayed buffers, then it is
* time comsuming.
*
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
* bigalloc and writeout can figure out if a block or a range of
* blocks is under delayed allocation(belonged to a delayed extent) or
- * not by searching the delayed extent tree.
+ * not by searching the extent tree.
*
*
* ==========================================================================
- * 2. ext4 delayed extents impelmentation
+ * 2. Ext4 extent status tree impelmentation
+ *
+ * -- extent
+ * A extent is a range of blocks which are contiguous logically and
+ * physically. Unlike extent in extent tree, this extent in ext4 is
+ * a in-memory struct, there is no corresponding on-disk data. There
+ * is no limit on length of extent, so an extent can contain as many
+ * blocks as they are contiguous logically and physically.
*
- * -- delayed extent
- * A delayed extent is a range of blocks which are contiguous
- * logically and under delayed allocation. Unlike extent in
- * ext4, delayed extent in ext4 is a in-memory struct, there is
- * no corresponding on-disk data. There is no limit on length of
- * delayed extent, so a delayed extent can contain as many blocks
- * as they are contiguous logically.
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extent in the
+ * tree are ordered by logical block no.
*
- * -- delayed extent tree
- * Every inode has a delayed extent tree and all under delayed
- * allocation blocks are added to the tree as delayed extents.
- * Delayed extents in the tree are ordered by logical block no.
+ * -- operations on a extent status tree
+ * There are three important operations on a delayed extent tree: find
+ * next extent, adding a extent(a range of blocks) and removing a extent.
*
- * -- operations on a delayed extent tree
- * There are three operations on a delayed extent tree: find next
- * delayed extent, adding a space(a range of blocks) and removing
- * a space.
+ * -- race on a extent status tree
+ * Extent status tree is protected by inode->i_es_lock.
*
- * -- race on a delayed extent tree
- * Delayed extent tree is protected inode->i_es_lock.
+ * -- memory consumption
+ * Fragmented extent tree will make extent status tree cost too much
+ * memory. Hence, we will reclaim written/unwritten/hole extents from
+ * the tree under a heavy memory pressure.
*
*
* ==========================================================================
- * 3. performance analysis
+ * 3. Performance analysis
+ *
* -- overhead
* 1. There is a cache extent for write access, so if writes are
* not very random, adding space operaions are in O(1) time.
@@ -120,15 +134,21 @@
*
* ==========================================================================
* 4. TODO list
- * -- Track all extent status
*
- * -- Improve get block process
+ * -- Refactor delayed space reservation
*
* -- Extent-level locking
*/
static struct kmem_cache *ext4_es_cachep;
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+static int ext4_es_reclaim_extents_count(struct super_block *sb);
+
int __init ext4_init_es(void)
{
ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
@@ -161,7 +181,9 @@ static void ext4_es_print_tree(struct inode *inode)
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+ printk(KERN_DEBUG " [%u/%u) %llu %llx",
+ es->es_lblk, es->es_len,
+ ext4_es_pblock(es), ext4_es_status(es));
node = rb_next(node);
}
printk(KERN_DEBUG "\n");
@@ -170,10 +192,10 @@ static void ext4_es_print_tree(struct inode *inode)
#define ext4_es_print_tree(inode)
#endif
-static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
- BUG_ON(es->start + es->len < es->start);
- return es->start + es->len - 1;
+ BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+ return es->es_lblk + es->es_len - 1;
}
/*
@@ -181,25 +203,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
* it can't be found, try to find next extent.
*/
static struct extent_status *__es_tree_search(struct rb_root *root,
- ext4_lblk_t offset)
+ ext4_lblk_t lblk)
{
struct rb_node *node = root->rb_node;
struct extent_status *es = NULL;
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
- if (offset < es->start)
+ if (lblk < es->es_lblk)
node = node->rb_left;
- else if (offset > extent_status_end(es))
+ else if (lblk > ext4_es_end(es))
node = node->rb_right;
else
return es;
}
- if (es && offset < es->start)
+ if (es && lblk < es->es_lblk)
return es;
- if (es && offset > extent_status_end(es)) {
+ if (es && lblk > ext4_es_end(es)) {
node = rb_next(&es->rb_node);
return node ? rb_entry(node, struct extent_status, rb_node) :
NULL;
@@ -209,79 +231,121 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
+ * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk
+ * if it exists, otherwise, the next extent after @es->lblk.
*
* @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
* @es: delayed extent that we found
- *
- * Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
- * Delayed extent is returned via @es.
*/
-ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
- ext4_lblk_t ret = EXT_MAX_BLOCKS;
- trace_ext4_es_find_extent_enter(inode, es->start);
+ BUG_ON(es == NULL);
+ trace_ext4_es_find_delayed_extent_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
- /* find delay extent in cache firstly */
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
if (tree->cache_es) {
es1 = tree->cache_es;
- if (in_range(es->start, es1->start, es1->len)) {
- es_debug("%u cached by [%u/%u)\n",
- es->start, es1->start, es1->len);
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %llx\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
goto out;
}
}
- es->len = 0;
- es1 = __es_tree_search(&tree->root, es->start);
+ es1 = __es_tree_search(&tree->root, lblk);
out:
- if (es1) {
- tree->cache_es = es1;
- es->start = es1->start;
- es->len = es1->len;
- node = rb_next(&es1->rb_node);
- if (node) {
+ if (es1 && !ext4_es_is_delayed(es1)) {
+ while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
- ret = es1->start;
+ if (ext4_es_is_delayed(es1))
+ break;
}
}
+ if (es1 && ext4_es_is_delayed(es1)) {
+ tree->cache_es = es1;
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
read_unlock(&EXT4_I(inode)->i_es_lock);
- trace_ext4_es_find_extent_exit(inode, es, ret);
- return ret;
+ ext4_es_lru_add(inode);
+ trace_ext4_es_find_delayed_extent_exit(inode, es);
}
static struct extent_status *
-ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk)
{
struct extent_status *es;
es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
if (es == NULL)
return NULL;
- es->start = start;
- es->len = len;
+ es->es_lblk = lblk;
+ es->es_len = len;
+ es->es_pblk = pblk;
+
+ /*
+ * We don't count delayed extent because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es))
+ EXT4_I(inode)->i_es_lru_nr++;
+
return es;
}
-static void ext4_es_free_extent(struct extent_status *es)
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+ }
+
kmem_cache_free(ext4_es_cachep, es);
}
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ * - logical block number is contiguous
+ * - physical block number is contiguous
+ * - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+{
+ if (es1->es_lblk + es1->es_len != es2->es_lblk)
+ return 0;
+
+ if (ext4_es_status(es1) != ext4_es_status(es2))
+ return 0;
+
+ if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+ (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2)))
+ return 0;
+
+ return 1;
+}
+
static struct extent_status *
-ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -290,10 +354,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es->start == extent_status_end(es1) + 1) {
- es1->len += es->len;
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
es = es1;
}
@@ -301,8 +365,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
}
static struct extent_status *
-ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -311,69 +376,57 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es1->start == extent_status_end(es) + 1) {
- es->len += es1->len;
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
rb_erase(node, &tree->root);
- ext4_es_free_extent(es1);
+ ext4_es_free_extent(inode, es1);
}
return es;
}
-static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
- ext4_lblk_t len)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
struct rb_node *parent = NULL;
struct extent_status *es;
- ext4_lblk_t end = offset + len - 1;
-
- BUG_ON(end < offset);
- es = tree->cache_es;
- if (es && offset == (extent_status_end(es) + 1)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
- goto out;
- } else if (es && es->start == end + 1) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
- goto out;
- } else if (es && es->start <= offset &&
- end <= extent_status_end(es)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- goto out;
- }
while (*p) {
parent = *p;
es = rb_entry(parent, struct extent_status, rb_node);
- if (offset < es->start) {
- if (es->start == end + 1) {
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
+ if (newes->es_lblk < es->es_lblk) {
+ if (ext4_es_can_be_merged(newes, es)) {
+ /*
+ * Here we can modify es_lblk directly
+ * because it isn't overlapped.
+ */
+ es->es_lblk = newes->es_lblk;
+ es->es_len += newes->es_len;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es))
+ ext4_es_store_pblock(es,
+ newes->es_pblk);
+ es = ext4_es_try_to_merge_left(inode, es);
goto out;
}
p = &(*p)->rb_left;
- } else if (offset > extent_status_end(es)) {
- if (offset == extent_status_end(es) + 1) {
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
+ } else if (newes->es_lblk > ext4_es_end(es)) {
+ if (ext4_es_can_be_merged(es, newes)) {
+ es->es_len += newes->es_len;
+ es = ext4_es_try_to_merge_right(inode, es);
goto out;
}
p = &(*p)->rb_right;
} else {
- if (extent_status_end(es) <= end)
- es->len = offset - es->start + len;
- goto out;
+ BUG_ON(1);
+ return -EINVAL;
}
}
- es = ext4_es_alloc_extent(offset, len);
+ es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
if (!es)
return -ENOMEM;
rb_link_node(&es->rb_node, parent, p);
@@ -385,85 +438,166 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_insert_extent() adds a space to a extent status tree.
*
* ext4_es_insert_extent is called by ext4_da_write_begin and
* ext4_es_remove_extent.
*
* Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned long long status)
{
- struct ext4_es_tree *tree;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err = 0;
- trace_ext4_es_insert_extent(inode, offset, len);
- es_debug("add [%u/%u) to extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
+ es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, inode->i_ino);
+
+ if (!len)
+ return 0;
+
+ BUG_ON(end < lblk);
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock(&newes, pblk);
+ ext4_es_store_status(&newes, status);
+ trace_ext4_es_insert_extent(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- err = __es_insert_extent(tree, offset, len);
+ err = __es_remove_extent(inode, lblk, end);
+ if (err != 0)
+ goto error;
+ err = __es_insert_extent(inode, &newes);
+
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
*
- * Return 0 on success, error code on failure.
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es)
{
- struct rb_node *node;
struct ext4_es_tree *tree;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ int found = 0;
+
+ trace_ext4_es_lookup_extent_enter(inode, lblk);
+ es_debug("lookup extent in block %u\n", lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+ read_lock(&EXT4_I(inode)->i_es_lock);
+
+ /* find extent in cache firstly */
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
+ }
+ }
+
+ node = tree->root.rb_node;
+ while (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (lblk < es1->es_lblk)
+ node = node->rb_left;
+ else if (lblk > ext4_es_end(es1))
+ node = node->rb_right;
+ else {
+ found = 1;
+ break;
+ }
+ }
+
+out:
+ if (found) {
+ BUG_ON(!es1);
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_lru_add(inode);
+ trace_ext4_es_lookup_extent_exit(inode, es, found);
+ return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
struct extent_status *es;
struct extent_status orig_es;
- ext4_lblk_t len1, len2, end;
+ ext4_lblk_t len1, len2;
+ ext4_fsblk_t block;
int err = 0;
- trace_ext4_es_remove_extent(inode, offset, len);
- es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
-
- end = offset + len - 1;
- BUG_ON(end < offset);
- write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- es = __es_tree_search(&tree->root, offset);
+ es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
- if (es->start > end)
+ if (es->es_lblk > end)
goto out;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
- orig_es.start = es->start;
- orig_es.len = es->len;
- len1 = offset > es->start ? offset - es->start : 0;
- len2 = extent_status_end(es) > end ?
- extent_status_end(es) - end : 0;
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
if (len1 > 0)
- es->len = len1;
+ es->es_len = len1;
if (len2 > 0) {
if (len1 > 0) {
- err = __es_insert_extent(tree, end + 1, len2);
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ if (ext4_es_is_written(&orig_es) ||
+ ext4_es_is_unwritten(&orig_es)) {
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock(&newes, block);
+ }
+ ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+ err = __es_insert_extent(inode, &newes);
if (err) {
- es->start = orig_es.start;
- es->len = orig_es.len;
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
goto out;
}
} else {
- es->start = end + 1;
- es->len = len2;
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ if (ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es)) {
+ block = orig_es.es_pblk + orig_es.es_len - len2;
+ ext4_es_store_pblock(es, block);
+ }
}
goto out;
}
@@ -476,10 +610,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = NULL;
}
- while (es && extent_status_end(es) <= end) {
+ while (es && ext4_es_end(es) <= end) {
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
if (!node) {
es = NULL;
break;
@@ -487,14 +621,183 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = rb_entry(node, struct extent_status, rb_node);
}
- if (es && es->start < end + 1) {
- len1 = extent_status_end(es) - end;
- es->start = end + 1;
- es->len = len1;
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ len1 = ext4_es_end(es) - end;
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+ block = es->es_pblk + orig_len - len1;
+ ext4_es_store_pblock(es, block);
+ }
}
out:
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ ext4_lblk_t end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+
+ if (!len)
+ return err;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
return err;
}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp, scanned;
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk = 0;
+
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan);
+
+ if (!nr_to_scan)
+ return ext4_es_reclaim_extents_count(sbi->s_sb);
+
+ INIT_LIST_HEAD(&scanned);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+ list_move_tail(cur, &scanned);
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+ read_lock(&ei->i_es_lock);
+ if (ei->i_es_lru_nr == 0) {
+ read_unlock(&ei->i_es_lock);
+ continue;
+ }
+ read_unlock(&ei->i_es_lock);
+
+ write_lock(&ei->i_es_lock);
+ ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += ret;
+ nr_to_scan -= ret;
+ if (nr_to_scan == 0)
+ break;
+ }
+ list_splice_tail(&scanned, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk);
+
+ return ext4_es_reclaim_extents_count(sbi->s_sb);
+}
+
+void ext4_es_register_shrinker(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_shrinker.shrink = ext4_es_shrink;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct super_block *sb)
+{
+ unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (list_empty(&ei->i_es_lru))
+ list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ else
+ list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (!list_empty(&ei->i_es_lru))
+ list_del_init(&ei->i_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int ext4_es_reclaim_extents_count(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ struct list_head *cur;
+ int nr_cached = 0;
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each(cur, &sbi->s_es_lru) {
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+ read_lock(&ei->i_es_lock);
+ nr_cached += ei->i_es_lru_nr;
+ read_unlock(&ei->i_es_lock);
+ }
+ spin_unlock(&sbi->s_es_lru_lock);
+ trace_ext4_es_reclaim_extents_count(sb, nr_cached);
+ return nr_cached;
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ int nr_shrunk = 0;
+
+ if (ei->i_es_lru_nr == 0)
+ return 0;
+
+ node = rb_first(&tree->root);
+ while (node != NULL) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extent from status tree because
+ * fiemap, bigallic, and seek_data/hole need to use it.
+ */
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ nr_shrunk++;
+ if (--nr_to_scan == 0)
+ break;
+ }
+ }
+ tree->cache_es = NULL;
+ return nr_shrunk;
+}