aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/transaction.c4
-rw-r--r--fs/btrfs/tree-log.c363
-rw-r--r--fs/ext4/ext4.h23
-rw-r--r--fs/ext4/move_extent.c11
-rw-r--r--fs/ext4/super.c47
5 files changed, 426 insertions, 22 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7dce00b91a71..240019f36b2a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -723,7 +723,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->ordered)) {
spin_lock(&info->trans_lock);
- list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
spin_unlock(&info->trans_lock);
}
@@ -1732,7 +1732,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
- list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3b68c75eccea..f6c20cf6090e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3929,6 +3929,308 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
return 0;
}
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ * 1) create file with 128Kb of data
+ * 2) truncate file to 64Kb
+ * 3) truncate file to 256Kb
+ * 4) fsync file
+ * 5) <crash/power failure>
+ * 6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, that is already done
+ * by copy_items(). We also only need to do this in the full sync path, where we
+ * lookup for extents from the fs/subvol tree only. In the fast path case, we
+ * lookup the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ u64 hole_start;
+ u64 hole_size;
+ struct extent_buffer *leaf;
+ struct btrfs_root *log = root->log_root;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(inode);
+
+ if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* inode does not have any extents */
+ hole_start = 0;
+ hole_size = i_size;
+ } else {
+ struct btrfs_file_extent_item *extent;
+ u64 len;
+
+ /*
+ * If there's an extent beyond i_size, an explicit hole was
+ * already inserted by copy_items().
+ */
+ if (key.offset >= i_size)
+ return 0;
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(leaf,
+ path->slots[0],
+ extent);
+ ASSERT(len == i_size);
+ return 0;
+ }
+
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ /* Last extent goes beyond i_size, no need to log a hole. */
+ if (key.offset + len > i_size)
+ return 0;
+ hole_start = key.offset + len;
+ hole_size = i_size - hole_start;
+ }
+ btrfs_release_path(path);
+
+ /* Last extent ends at i_size. */
+ if (hole_size == 0)
+ return 0;
+
+ hole_size = ALIGN(hole_size, root->sectorsize);
+ ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+ hole_size, 0, hole_size, 0, 0, 0);
+ return ret;
+}
+
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ const int slot,
+ const struct btrfs_key *key,
+ struct inode *inode)
+{
+ int ret;
+ struct btrfs_path *search_path;
+ char *name = NULL;
+ u32 name_len = 0;
+ u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 cur_offset = 0;
+ unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+ search_path = btrfs_alloc_path();
+ if (!search_path)
+ return -ENOMEM;
+ search_path->search_commit_root = 1;
+ search_path->skip_locking = 1;
+
+ while (cur_offset < item_size) {
+ u64 parent;
+ u32 this_name_len;
+ u32 this_len;
+ unsigned long name_ptr;
+ struct btrfs_dir_item *di;
+
+ if (key->type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *iref;
+
+ iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+ parent = key->offset;
+ this_name_len = btrfs_inode_ref_name_len(eb, iref);
+ name_ptr = (unsigned long)(iref + 1);
+ this_len = sizeof(*iref) + this_name_len;
+ } else {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)(ptr +
+ cur_offset);
+ parent = btrfs_inode_extref_parent(eb, extref);
+ this_name_len = btrfs_inode_extref_name_len(eb, extref);
+ name_ptr = (unsigned long)&extref->name;
+ this_len = sizeof(*extref) + this_name_len;
+ }
+
+ if (this_name_len > name_len) {
+ char *new_name;
+
+ new_name = krealloc(name, this_name_len, GFP_NOFS);
+ if (!new_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ name_len = this_name_len;
+ name = new_name;
+ }
+
+ read_extent_buffer(eb, name, name_ptr, this_name_len);
+ di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+ search_path, parent,
+ name, this_name_len, 0);
+ if (di && !IS_ERR(di)) {
+ ret = 1;
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ btrfs_release_path(search_path);
+
+ cur_offset += this_len;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(search_path);
+ kfree(name);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -3967,6 +4269,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 logged_isize = 0;
+ bool need_log_inode_item = true;
path = btrfs_alloc_path();
if (!path)
@@ -4056,11 +4359,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
- ret = log_inode_item(trans, log, dst_path, inode);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
goto log_extents;
}
@@ -4083,6 +4381,44 @@ again:
if (min_key.type > max_key.type)
break;
+ if (min_key.type == BTRFS_INODE_ITEM_KEY)
+ need_log_inode_item = false;
+
+ if ((min_key.type == BTRFS_INODE_REF_KEY ||
+ min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+ BTRFS_I(inode)->generation == trans->transid) {
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0],
+ &min_key, inode);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ } else if (ret > 0) {
+ err = 1;
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ goto out_unlock;
+ }
+ }
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ if (ret) {
+ btrfs_release_path(path);
+ continue;
+ }
+ goto next_slot;
+ }
+
src = path->nodes[0];
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
@@ -4150,9 +4486,26 @@ next_slot:
ins_nr = 0;
}
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ if (err)
+ goto out_unlock;
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_trailing_hole(trans, root, inode, path);
+ if (err)
+ goto out_unlock;
+ }
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ err = log_inode_item(trans, log, dst_path, inode);
+ if (err)
+ goto out_unlock;
+ }
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c55a1faaed58..6ce71e4e82e5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -811,6 +811,29 @@ do { \
#include "extents_status.h"
/*
+ * Lock subclasses for i_data_sem in the ext4_inode_info structure.
+ *
+ * These are needed to avoid lockdep false positives when we need to
+ * allocate blocks to the quota inode during ext4_map_blocks(), while
+ * holding i_data_sem for a normal (non-quota) inode. Since we don't
+ * do quota tracking for the quota inode, this avoids deadlock (as
+ * well as infinite recursion, since it isn't turtles all the way
+ * down...)
+ *
+ * I_DATA_SEM_NORMAL - Used for most inodes
+ * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode
+ * where the second inode has larger inode number
+ * than the first
+ * I_DATA_SEM_QUOTA - Used for quota inodes only
+ */
+enum {
+ I_DATA_SEM_NORMAL = 0,
+ I_DATA_SEM_OTHER,
+ I_DATA_SEM_QUOTA,
+};
+
+
+/*
* fourth extended file system inode data in memory
*/
struct ext4_inode_info {
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index f498c34b4688..4d1049517e4a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
} else {
down_write(&EXT4_I(second)->i_data_sem);
- down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
}
}
@@ -491,6 +491,13 @@ mext_check_arguments(struct inode *orig_inode,
return -EBUSY;
}
+ if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be quota files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EBUSY;
+ }
+
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b16ba5239dcf..a5d94f263870 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1281,9 +1281,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
return -1;
}
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
- ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
- "when QUOTA feature is enabled");
- return -1;
+ ext4_msg(sb, KERN_INFO, "Journaled quota options "
+ "ignored when QUOTA feature is enabled");
+ return 1;
}
qname = match_strdup(args);
if (!qname) {
@@ -1622,10 +1622,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_QUOTA)) {
- ext4_msg(sb, KERN_ERR,
- "Cannot set journaled quota options "
+ ext4_msg(sb, KERN_INFO,
+ "Quota format mount options ignored "
"when QUOTA feature is enabled");
- return -1;
+ return 1;
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
@@ -1677,11 +1677,11 @@ static int parse_options(char *options, struct super_block *sb,
#ifdef CONFIG_QUOTA
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
- ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
- "feature is enabled");
- return 0;
- }
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota "
+ "mount options ignored.");
+ clear_opt(sb, USRQUOTA);
+ clear_opt(sb, GRPQUOTA);
+ } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -5187,6 +5187,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
EXT4_SB(sb)->s_jquota_fmt, type);
}
+static void lockdep_set_quota_inode(struct inode *inode, int subclass)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ /* The first argument of lockdep_set_subclass has to be
+ * *exactly* the same as the argument to init_rwsem() --- in
+ * this case, in init_once() --- or lockdep gets unhappy
+ * because the name of the lock is set using the
+ * stringification of the argument to init_rwsem().
+ */
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, subclass);
+}
+
/*
* Standard function to be called on quota_on
*/
@@ -5226,8 +5240,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (err)
return err;
}
-
- return dquot_quota_on(sb, type, format_id, path);
+ lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
+ return err;
}
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
@@ -5253,8 +5271,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
+ if (err)
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
return err;
}