diff options
Diffstat (limited to 'fs')
39 files changed, 1338 insertions, 412 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index ccfd31f1df3a..b1aa9877acba 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -14,6 +14,7 @@ #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> @@ -546,7 +547,8 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __read_mostly; +EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { @@ -687,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - /* Call when you free inode */ void bd_forget(struct inode *inode) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 99e8f60c7962..ff10d6e093f5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3272,11 +3272,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3304,11 +3301,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 885f533a34d9..32e0df2d0bd6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 2907544c3a1d..8ba5bf01d341 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -30,6 +30,7 @@ #include <linux/quotaops.h> #include <linux/highmem.h> #include <linux/export.h> +#include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> @@ -44,6 +45,9 @@ #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -615,21 +619,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. + * + * The caller must hold mem_cgroup_begin_page_stat() lock. */ -static void __set_page_dirty(struct page *page, - struct address_space *mapping, int warn) +static void __set_page_dirty(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, int warn) { unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } /* @@ -660,6 +665,7 @@ static void __set_page_dirty(struct page *page, int __set_page_dirty_buffers(struct page *page) { int newly_dirty; + struct mem_cgroup *memcg; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -675,11 +681,22 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, 1); + __set_page_dirty(page, mapping, memcg, 1); + + mem_cgroup_end_page_stat(memcg); + + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1150,11 +1167,18 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; + struct address_space *mapping = NULL; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); + mapping = page_mapping(page); if (mapping) - __set_page_dirty(page, mapping, 0); + __set_page_dirty(page, mapping, memcg, 0); } + mem_cgroup_end_page_stat(memcg); + if (mapping) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } EXPORT_SYMBOL(mark_buffer_dirty); @@ -1676,8 +1700,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1766,7 +1789,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -1820,7 +1843,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -2930,10 +2953,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) { struct buffer_head *bh = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - } - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) set_bit(BH_Quiet, &bh->b_state); @@ -2989,7 +3008,8 @@ void guard_bio_eod(int rw, struct bio *bio) } } -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; int ret = 0; @@ -3012,6 +3032,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; @@ -3033,20 +3058,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) if (buffer_prio(bh)) rw |= REQ_PRIO; - bio_get(bio); submit_bio(rw, bio); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); return ret; } + +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +{ + return submit_bh_wbc(rw, bh, bio_flags, NULL); +} EXPORT_SYMBOL_GPL(_submit_bh); int submit_bh(int rw, struct buffer_head *bh) { - return _submit_bh(rw, bh, 0); + return submit_bh_wbc(rw, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3235,8 +3259,8 @@ int try_to_free_buffers(struct page *page) * to synchronise against __set_page_dirty_buffers and prevent the * dirty bit from being lost. */ - if (ret && TestClearPageDirty(page)) - account_page_cleaned(page, mapping); + if (ret) + cancel_dirty_page(page); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 830a7e76f5c6..39916043da10 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -54,6 +54,22 @@ const struct inode_operations debugfs_link_operations = { .follow_link = debugfs_follow_link, }; +static struct dentry *debugfs_create_mode(const char *name, umode_t mode, + struct dentry *parent, void *value, + const struct file_operations *fops, + const struct file_operations *fops_ro, + const struct file_operations *fops_wo) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, fops_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, fops_wo); + + return debugfs_create_file(name, mode, parent, value, fops); +} + static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; @@ -95,14 +111,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u8); + return debugfs_create_mode(name, mode, parent, value, &fops_u8, + &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -147,14 +157,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u16); + return debugfs_create_mode(name, mode, parent, value, &fops_u16, + &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -199,14 +203,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u32); + return debugfs_create_mode(name, mode, parent, value, &fops_u32, + &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -252,17 +250,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u64); + return debugfs_create_mode(name, mode, parent, value, &fops_u64, + &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); +static int debugfs_ulong_set(void *data, u64 val) +{ + *(unsigned long *)data = val; + return 0; +} + +static int debugfs_ulong_get(void *data, u64 *val) +{ + *val = *(unsigned long *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); + +/** + * debugfs_create_ulong - create a debugfs file that is used to read and write + * an unsigned long value. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_ulong(const char *name, umode_t mode, + struct dentry *parent, unsigned long *value) +{ + return debugfs_create_mode(name, mode, parent, value, &fops_ulong, + &fops_ulong_ro, &fops_ulong_wo); +} +EXPORT_SYMBOL_GPL(debugfs_create_ulong); + DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); @@ -276,6 +316,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value @@ -298,14 +340,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x8); + return debugfs_create_mode(name, mode, parent, value, &fops_x8, + &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -322,14 +358,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x16); + return debugfs_create_mode(name, mode, parent, value, &fops_x16, + &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -346,14 +376,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x32); + return debugfs_create_mode(name, mode, parent, value, &fops_x32, + &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -370,7 +394,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_file(name, mode, parent, value, &fops_x64); + return debugfs_create_mode(name, mode, parent, value, &fops_x64, + &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -387,6 +412,8 @@ static int debugfs_size_t_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value @@ -401,7 +428,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { - return debugfs_create_file(name, mode, parent, value, &fops_size_t); + return debugfs_create_mode(name, mode, parent, value, &fops_size_t, + &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -434,24 +462,16 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, - &fops_atomic_t_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, - &fops_atomic_t_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); + return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t, + &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; - u32 *val = file->private_data; + bool *val = file->private_data; if (*val) buf[0] = 'Y'; @@ -461,14 +481,15 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; bool bv; - u32 *val = file->private_data; + bool *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -480,10 +501,23 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = debugfs_read_file_bool, + .write = debugfs_write_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_bool_ro = { + .read = debugfs_read_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_bool_wo = { + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; @@ -513,9 +547,10 @@ static const struct file_operations fops_bool = { * code. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, - struct dentry *parent, u32 *value) + struct dentry *parent, bool *value) { - return debugfs_create_file(name, mode, parent, value, &fops_bool); + return debugfs_create_mode(name, mode, parent, value, &fops_bool, + &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d0e746e96511..900e19cf9ef6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9a83f149ac85..8b033f24a6b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -191,7 +191,7 @@ typedef struct ext4_io_end { } ext4_io_end_t; struct ext4_io_submit { - int io_op; + struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; @@ -873,6 +873,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -2287,6 +2296,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); @@ -2632,6 +2642,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 87ba10d1d3bc..677955986ad7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> #include <linux/fiemap.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -4741,7 +4742,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4757,17 +4757,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. This is not fallocate, we neet to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4810,6 +4799,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4818,7 +4811,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4827,16 +4820,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4964,8 +4964,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5424,21 +5429,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5454,17 +5445,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5503,7 +5520,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0613c256c344..dd65fac5ff2f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -213,7 +213,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = { #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b3a53a51582..3291e1af0e24 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3524,6 +3524,35 @@ int ext4_can_truncate(struct inode *inode) } /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length * @@ -3588,17 +3617,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3645,16 +3683,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); @@ -4775,11 +4809,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* * We want to call ext4_truncate() even if attr->ia_size == @@ -5234,6 +5270,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5303,6 +5341,19 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41260489d3bc..5b1613a54307 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -26,6 +26,7 @@ #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/backing-dev.h> #include <trace/events/ext4.h> #ifdef CONFIG_EXT4_DEBUG diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 8082565c59a9..00cda09e4412 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -357,9 +357,10 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { + int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE; bio_get(io->io_bio); - submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + submit_bio(io_op, io->io_bio); bio_put(io->io_bio); } io->io_bio = NULL; @@ -368,7 +369,7 @@ void ext4_io_submit(struct ext4_io_submit *io) void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc) { - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_wbc = wbc; io->io_bio = NULL; io->io_end = NULL; } @@ -382,6 +383,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; @@ -410,6 +412,7 @@ submit_and_retry: ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; + wbc_account_io(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a3b9f14d198..abb7ec3f19b9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> @@ -945,6 +946,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -3649,6 +3651,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ab0cf1930bd..d211602e0f86 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; } return res; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 85d7fa7514b2..aba72f7a8ac4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) @@ -713,7 +714,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) diff --git a/fs/fat/file.c b/fs/fat/file.c index 442d50a0e33e..a08f1039909a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,6 +11,7 @@ #include <linux/compat.h> #include <linux/mount.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> #include "fat.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c06774658345..509411dd3698 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,6 +18,7 @@ #include <linux/parser.h> #include <linux/uio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <asm/unaligned.h> #include "fat.h" diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 32a8bbd7a9ad..518c6294bf6c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> +#include <linux/memcontrol.h> #include "internal.h" /* @@ -34,6 +35,10 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) +struct wb_completion { + atomic_t cnt; +}; + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -47,13 +52,29 @@ struct wb_writeback_work { unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + unsigned int auto_free:1; /* free on completion */ + unsigned int single_wait:1; + unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ - struct completion *done; /* set if the caller waits */ + struct wb_completion *done; /* set if the caller waits */ }; /* + * If one wants to wait for one or more wb_writeback_works, each work's + * ->done should be set to a wb_completion defined using the following + * macro. Once all work items are issued with wb_queue_work(), the caller + * can wait for the completion of all using wb_wait_for_completion(). Work + * items which are waited upon aren't freed automatically on completion. + */ +#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ + struct wb_completion cmpl = { \ + .cnt = ATOMIC_INIT(1), \ + } + + +/* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its @@ -65,35 +86,6 @@ struct wb_writeback_work { */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; -/** - * writeback_in_progress - determine whether there is writeback in progress - * @bdi: the device's backing_dev_info structure. - * - * Determine whether there is writeback waiting to be handled against a - * backing device. - */ -int writeback_in_progress(struct backing_dev_info *bdi) -{ - return test_bit(BDI_writeback_running, &bdi->state); -} -EXPORT_SYMBOL(writeback_in_progress); - -struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); -#endif - return sb->s_bdi; -} -EXPORT_SYMBOL_GPL(inode_to_bdi); - static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_wb_list); @@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head) EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); -static void bdi_wakeup_thread(struct backing_dev_info *bdi) +static bool wb_io_lists_populated(struct bdi_writeback *wb) +{ + if (wb_has_dirty_io(wb)) { + return false; + } else { + set_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(!wb->avg_write_bandwidth); + atomic_long_add(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth); + return true; + } +} + +static void wb_io_lists_depopulated(struct bdi_writeback *wb) { - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - spin_unlock_bh(&bdi->wb_lock); + if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && + list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { + clear_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth) < 0); + } } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/** + * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * @inode: inode to be moved + * @wb: target bdi_writeback + * @head: one of @wb->b_{dirty|io|more_io} + * + * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Returns %true if @inode is the first occupant of the !dirty_time IO + * lists; otherwise, %false. + */ +static bool inode_wb_list_move_locked(struct inode *inode, + struct bdi_writeback *wb, + struct list_head *head) { - trace_writeback_queue(bdi, work); + assert_spin_locked(&wb->list_lock); + + list_move(&inode->i_wb_list, head); - spin_lock_bh(&bdi->wb_lock); - if (!test_bit(BDI_registered, &bdi->state)) { - if (work->done) - complete(work->done); + /* dirty_time doesn't count as dirty_io until expiration */ + if (head != &wb->b_dirty_time) + return wb_io_lists_populated(wb); + + wb_io_lists_depopulated(wb); + return false; +} + +/** + * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * @inode: inode to be removed + * @wb: bdi_writeback @inode is being removed from + * + * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and + * clear %WB_has_dirty_io if all are empty afterwards. + */ +static void inode_wb_list_del_locked(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + + list_del_init(&inode->i_wb_list); + wb_io_lists_depopulated(wb); +} + +static void wb_wakeup(struct bdi_writeback *wb) +{ + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + spin_unlock_bh(&wb->work_lock); +} + +static void wb_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + trace_writeback_queue(wb->bdi, work); + + spin_lock_bh(&wb->work_lock); + if (!test_bit(WB_registered, &wb->state)) { + if (work->single_wait) + work->single_done = 1; goto out_unlock; } - list_add_tail(&work->list, &bdi->work_list); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + if (work->done) + atomic_inc(&work->done->cnt); + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); out_unlock: - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); +} + +/** + * wb_wait_for_completion - wait for completion of bdi_writeback_works + * @bdi: bdi work items were issued to + * @done: target wb_completion + * + * Wait for one or more work items issued to @bdi with their ->done field + * set to @done, which should have been defined with + * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such + * work items are completed. Work items which are waited upon aren't freed + * automatically on completion. + */ +static void wb_wait_for_completion(struct backing_dev_info *bdi, + struct wb_completion *done) +{ + atomic_dec(&done->cnt); /* put down the initial count */ + wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +/* parameters for foreign inode detection, see wb_detach_inode() */ +#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ +#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ +#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ + +#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ +#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) + /* each slot's duration is 2s / 16 */ +#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) + /* if foreign slots >= 8, switch */ +#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) + /* one round can affect upto 5 slots */ + +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + +/** + * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it + * @inode: inode of interest with i_lock held + * + * Returns @inode's wb with its list_lock held. @inode->i_lock must be + * held on entry and is released on return. The returned wb is guaranteed + * to stay @inode's associated wb until its list_lock is released. + */ +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + while (true) { + struct bdi_writeback *wb = inode_to_wb(inode); + + /* + * inode_to_wb() association is protected by both + * @inode->i_lock and @wb->list_lock but list_lock nests + * outside i_lock. Drop i_lock and verify that the + * association hasn't changed after acquiring list_lock. + */ + wb_get(wb); + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + wb_put(wb); /* not gonna deref it anymore */ + + /* i_wb may have changed inbetween, can't use inode_to_wb() */ + if (likely(wb == inode->i_wb)) + return wb; /* @inode already has ref */ + + spin_unlock(&wb->list_lock); + cpu_relax(); + spin_lock(&inode->i_lock); + } +} + +/** + * inode_to_wb_and_lock_list - determine an inode's wb and lock it + * @inode: inode of interest + * + * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held + * on entry. + */ +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + spin_lock(&inode->i_lock); + return locked_inode_to_wb_and_lock_list(inode); +} + +struct inode_switch_wbs_context { + struct inode *inode; + struct bdi_writeback *new_wb; + + struct rcu_head rcu_head; + struct work_struct work; +}; + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(work, struct inode_switch_wbs_context, work); + struct inode *inode = isw->inode; + struct address_space *mapping = inode->i_mapping; + struct bdi_writeback *old_wb = inode->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + struct radix_tree_iter iter; + bool switched = false; + void **slot; + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against mapping->tree_lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + spin_lock(&inode->i_lock); + spin_lock_irq(&mapping->tree_lock); + + /* + * Once I_FREEING is visible under i_lock, the eviction path owns + * the inode and we shouldn't modify ->i_wb_list. + */ + if (unlikely(inode->i_state & I_FREEING)) + goto skip_switch; + + /* + * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points + * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to + * pages actually under underwriteback. + */ + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_DIRTY) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page) && PageDirty(page)) { + __dec_wb_stat(old_wb, WB_RECLAIMABLE); + __inc_wb_stat(new_wb, WB_RECLAIMABLE); + } + } + + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_WRITEBACK) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page)) { + WARN_ON_ONCE(!PageWriteback(page)); + __dec_wb_stat(old_wb, WB_WRITEBACK); + __inc_wb_stat(new_wb, WB_WRITEBACK); + } + } + + wb_get(new_wb); + + /* + * Transfer to @new_wb's IO list if necessary. The specific list + * @inode was on is ignored and the inode is put on ->b_dirty which + * is always correct including from ->b_dirty_time. The transfer + * preserves @inode->dirtied_when ordering. + */ + if (!list_empty(&inode->i_wb_list)) { + struct inode *pos; + + inode_wb_list_del_locked(inode, old_wb); + inode->i_wb = new_wb; + list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + } else { + inode->i_wb = new_wb; + } + + /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; + switched = true; +skip_switch: + /* + * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * ensures that the new wb is visible if they see !I_WB_SWITCH. + */ + smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + + spin_unlock_irq(&mapping->tree_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + + if (switched) { + wb_wakeup(new_wb); + wb_put(old_wb); + } + wb_put(new_wb); + + iput(inode); + kfree(isw); +} + +static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +{ + struct inode_switch_wbs_context *isw = container_of(rcu_head, + struct inode_switch_wbs_context, rcu_head); + + /* needs to grab bh-unsafe locks, bounce to work item */ + INIT_WORK(&isw->work, inode_switch_wbs_work_fn); + schedule_work(&isw->work); +} + +/** + * inode_switch_wbs - change the wb association of an inode + * @inode: target inode + * @new_wb_id: ID of the new wb + * + * Switch @inode's wb association to the wb identified by @new_wb_id. The + * switching is performed asynchronously and may fail silently. + */ +static void inode_switch_wbs(struct inode *inode, int new_wb_id) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + + /* noop if seems to be already in progress */ + if (inode->i_state & I_WB_SWITCH) + return; + + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) + return; + + /* find and pin the new wb */ + rcu_read_lock(); + memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); + if (memcg_css) + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + rcu_read_unlock(); + if (!isw->new_wb) + goto out_free; + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } + inode->i_state |= I_WB_SWITCH; + spin_unlock(&inode->i_lock); + + ihold(inode); + isw->inode = inode; + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the mapping's + * tree_lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + return; + +out_free: + if (isw->new_wb) + wb_put(isw->new_wb); + kfree(isw); +} + +/** + * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it + * @wbc: writeback_control of interest + * @inode: target inode + * + * @inode is locked and about to be written back under the control of @wbc. + * Record @inode's writeback context into @wbc and unlock the i_lock. On + * writeback completion, wbc_detach_inode() should be called. This is used + * to track the cgroup writeback context. + */ +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) +{ + if (!inode_cgwb_enabled(inode)) { + spin_unlock(&inode->i_lock); + return; + } + + wbc->wb = inode_to_wb(inode); + wbc->inode = inode; + + wbc->wb_id = wbc->wb->memcg_css->id; + wbc->wb_lcand_id = inode->i_wb_frn_winner; + wbc->wb_tcand_id = 0; + wbc->wb_bytes = 0; + wbc->wb_lcand_bytes = 0; + wbc->wb_tcand_bytes = 0; + + wb_get(wbc->wb); + spin_unlock(&inode->i_lock); + + /* + * A dying wb indicates that the memcg-blkcg mapping has changed + * and a new wb is already serving the memcg. Switch immediately. + */ + if (unlikely(wb_dying(wbc->wb))) + inode_switch_wbs(inode, wbc->wb_id); +} + +/** + * wbc_detach_inode - disassociate wbc from inode and perform foreign detection + * @wbc: writeback_control of the just finished writeback + * + * To be called after a writeback attempt of an inode finishes and undoes + * wbc_attach_and_unlock_inode(). Can be called under any context. + * + * As concurrent write sharing of an inode is expected to be very rare and + * memcg only tracks page ownership on first-use basis severely confining + * the usefulness of such sharing, cgroup writeback tracks ownership + * per-inode. While the support for concurrent write sharing of an inode + * is deemed unnecessary, an inode being written to by different cgroups at + * different points in time is a lot more common, and, more importantly, + * charging only by first-use can too readily lead to grossly incorrect + * behaviors (single foreign page can lead to gigabytes of writeback to be + * incorrectly attributed). + * + * To resolve this issue, cgroup writeback detects the majority dirtier of + * an inode and transfers the ownership to it. To avoid unnnecessary + * oscillation, the detection mechanism keeps track of history and gives + * out the switch verdict only if the foreign usage pattern is stable over + * a certain amount of time and/or writeback attempts. + * + * On each writeback attempt, @wbc tries to detect the majority writer + * using Boyer-Moore majority vote algorithm. In addition to the byte + * count from the majority voting, it also counts the bytes written for the + * current wb and the last round's winner wb (max of last round's current + * wb, the winner from two rounds ago, and the last round's majority + * candidate). Keeping track of the historical winner helps the algorithm + * to semi-reliably detect the most active writer even when it's not the + * absolute majority. + * + * Once the winner of the round is determined, whether the winner is + * foreign or not and how much IO time the round consumed is recorded in + * inode->i_wb_frn_history. If the amount of recorded foreign IO time is + * over a certain threshold, the switch verdict is given. + */ +void wbc_detach_inode(struct writeback_control *wbc) +{ + struct bdi_writeback *wb = wbc->wb; + struct inode *inode = wbc->inode; + unsigned long avg_time, max_bytes, max_time; + u16 history; + int max_id; + + if (!wb) + return; + + history = inode->i_wb_frn_history; + avg_time = inode->i_wb_frn_avg_time; + + /* pick the winner of this round */ + if (wbc->wb_bytes >= wbc->wb_lcand_bytes && + wbc->wb_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_id; + max_bytes = wbc->wb_bytes; + } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_lcand_id; + max_bytes = wbc->wb_lcand_bytes; + } else { + max_id = wbc->wb_tcand_id; + max_bytes = wbc->wb_tcand_bytes; + } + + /* + * Calculate the amount of IO time the winner consumed and fold it + * into the running average kept per inode. If the consumed IO + * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for + * deciding whether to switch or not. This is to prevent one-off + * small dirtiers from skewing the verdict. + */ + max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, + wb->avg_write_bandwidth); + if (avg_time) + avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - + (avg_time >> WB_FRN_TIME_AVG_SHIFT); + else + avg_time = max_time; /* immediate catch up on first run */ + + if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { + int slots; + + /* + * The switch verdict is reached if foreign wb's consume + * more than a certain proportion of IO time in a + * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot + * history mask where each bit represents one sixteenth of + * the period. Determine the number of slots to shift into + * history from @max_time. + */ + slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), + (unsigned long)WB_FRN_HIST_MAX_SLOTS); + history <<= slots; + if (wbc->wb_id != max_id) + history |= (1U << slots) - 1; + + /* + * Switch if the current wb isn't the consistent winner. + * If there are multiple closely competing dirtiers, the + * inode may switch across them repeatedly over time, which + * is okay. The main goal is avoiding keeping an inode on + * the wrong wb for an extended period of time. + */ + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + inode_switch_wbs(inode, max_id); + } + + /* + * Multiple instances of this function may race to update the + * following fields but we don't mind occassional inaccuracies. + */ + inode->i_wb_frn_winner = max_id; + inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); + inode->i_wb_frn_history = history; + + wb_put(wbc->wb); + wbc->wb = NULL; +} + +/** + * wbc_account_io - account IO issued during writeback + * @wbc: writeback_control of the writeback in progress + * @page: page being written out + * @bytes: number of bytes being written out + * + * @bytes from @page are about to written out during the writeback + * controlled by @wbc. Keep the book for foreign inode detection. See + * wbc_detach_inode(). + */ +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes) +{ + int id; + + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (!wbc->wb) + return; + + rcu_read_lock(); + id = mem_cgroup_css_from_page(page)->id; + rcu_read_unlock(); + + if (id == wbc->wb_id) { + wbc->wb_bytes += bytes; + return; + } + + if (id == wbc->wb_lcand_id) + wbc->wb_lcand_bytes += bytes; + + /* Boyer-Moore majority vote algorithm */ + if (!wbc->wb_tcand_bytes) + wbc->wb_tcand_id = id; + if (id == wbc->wb_tcand_id) + wbc->wb_tcand_bytes += bytes; + else + wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } +EXPORT_SYMBOL_GPL(wbc_account_io); -static void -__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, enum wb_reason reason) +/** + * inode_congested - test whether an inode is congested + * @inode: inode to test for congestion + * @cong_bits: mask of WB_[a]sync_congested bits to test + * + * Tests whether @inode is congested. @cong_bits is the mask of congestion + * bits to test and the return value is the mask of set bits. + * + * If cgroup writeback is enabled for @inode, the congestion state is + * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg + * associated with @inode is congested; otherwise, the root wb's congestion + * state is used. + */ +int inode_congested(struct inode *inode, int cong_bits) +{ + /* + * Once set, ->i_wb never becomes NULL while the inode is alive. + * Start transaction iff ->i_wb is visible. + */ + if (inode && inode_to_wb_is_valid(inode)) { + struct bdi_writeback *wb; + bool locked, congested; + + wb = unlocked_inode_to_wb_begin(inode, &locked); + congested = wb_congested(wb, cong_bits); + unlocked_inode_to_wb_end(inode, locked); + return congested; + } + + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} +EXPORT_SYMBOL_GPL(inode_congested); + +/** + * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work + * @bdi: bdi the work item was issued to + * @work: work item to wait for + * + * Wait for the completion of @work which was issued to one of @bdi's + * bdi_writeback's. The caller must have set @work->single_wait before + * issuing it. This wait operates independently fo + * wb_wait_for_completion() and also disables automatic freeing of @work. + */ +static void wb_wait_for_single_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + if (WARN_ON_ONCE(!work->single_wait)) + return; + + wait_event(bdi->wb_waitq, work->single_done); + + /* + * Paired with smp_wmb() in wb_do_writeback() and ensures that all + * modifications to @work prior to assertion of ->single_done is + * visible to the caller once this function returns. + */ + smp_rmb(); +} + +/** + * wb_split_bdi_pages - split nr_pages to write according to bandwidth + * @wb: target bdi_writeback to split @nr_pages to + * @nr_pages: number of pages to write for the whole bdi + * + * Split @wb's portion of @nr_pages according to @wb's write bandwidth in + * relation to the total write bandwidth of all wb's w/ dirty inodes on + * @wb->bdi. + */ +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + + if (nr_pages == LONG_MAX) + return LONG_MAX; + + /* + * This may be called on clean wb's and proportional distribution + * may not make sense, just use the original @nr_pages in those + * cases. In general, we wanna err on the side of writing more. + */ + if (!tot_bw || this_bw >= tot_bw) + return nr_pages; + else + return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); +} + +/** + * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb + * @wb: target bdi_writeback + * @base_work: source wb_writeback_work + * + * Try to make a clone of @base_work and issue it to @wb. If cloning + * succeeds, %true is returned; otherwise, @base_work is issued directly + * and %false is returned. In the latter case, the caller is required to + * wait for @base_work's completion using wb_wait_for_single_work(). + * + * A clone is auto-freed on completion. @base_work never is. + */ +static bool wb_clone_and_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *base_work) { struct wb_writeback_work *work; + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->auto_free = 1; + work->single_wait = 0; + } else { + work = base_work; + work->auto_free = 0; + work->single_wait = 1; + } + work->single_done = 0; + wb_queue_work(wb, work); + return work != base_work; +} + +/** + * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi + * @bdi: target backing_dev_info + * @base_work: wb_writeback_work to issue + * @skip_if_busy: skip wb's which already have writeback in progress + * + * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which + * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's + * distributed to the busy wbs according to each wb's proportion in the + * total active write bandwidth of @bdi. + */ +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + long nr_pages = base_work->nr_pages; + int next_blkcg_id = 0; + struct bdi_writeback *wb; + struct wb_iter iter; + + might_sleep(); + + if (!bdi_has_dirty_io(bdi)) + return; +restart: + rcu_read_lock(); + bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + if (!wb_has_dirty_io(wb) || + (skip_if_busy && writeback_in_progress(wb))) + continue; + + base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); + if (!wb_clone_and_queue_work(wb, base_work)) { + next_blkcg_id = wb->blkcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_single_work(bdi, base_work); + goto restart; + } + } + rcu_read_unlock(); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + return wb; +} + +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_lock(&wb->list_lock); + return wb; +} + +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + return nr_pages; +} + +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + might_sleep(); + + if (bdi_has_dirty_io(bdi) && + (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { + base_work->auto_free = 0; + base_work->single_wait = 0; + base_work->single_done = 0; + wb_queue_work(&bdi->wb, base_work); + } +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason) +{ + struct wb_writeback_work *work; + + if (!wb_has_dirty_io(wb)) + return; + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_nowork(wb->bdi); + wb_wakeup(wb); return; } @@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason = reason; + work->auto_free = 1; - bdi_queue_work(bdi, work); + wb_queue_work(wb, work); } /** - * bdi_start_writeback - start writeback - * @bdi: the backing device to write from - * @nr_pages: the number of pages to write - * @reason: reason why some writeback work was initiated - * - * Description: - * This does WB_SYNC_NONE opportunistic writeback. The IO is only - * started when this function returns, we make no guarantees on - * completion. Caller need not hold sb s_umount semaphore. - * - */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason) -{ - __bdi_start_writeback(bdi, nr_pages, true, reason); -} - -/** - * bdi_start_background_writeback - start background writeback - * @bdi: the backing device to write from + * wb_start_background_writeback - start background writeback + * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When - * this function returns, it is only guaranteed that for given BDI + * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ -void bdi_start_background_writeback(struct backing_dev_info *bdi) +void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_wake_background(wb->bdi); + wb_wakeup(wb); } /* @@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb; - spin_lock(&bdi->wb.list_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + wb = inode_to_wb_and_lock_list(inode); + inode_wb_list_del_locked(inode, wb); + spin_unlock(&wb->list_lock); } /* @@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode) */ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_wb_list, &wb->b_dirty); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, &wb->b_more_io); + inode_wb_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, EXPIRE_DIRTY_ATIME, work); + if (moved) + wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, moved); } @@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &wb->b_dirty_time); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); } } @@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); + wbc_detach_inode(wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); /* @@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -624,7 +1386,7 @@ out: return ret; } -static long writeback_chunk_size(struct backing_dev_info *bdi, +static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { - pages = min(bdi->avg_write_bandwidth / 2, - global_dirty_limit / DIRTY_SCOPE); + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); @@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb->bdi, work); + write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); @@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, return nr_pages - work.nr_pages; } -static bool over_bground_thresh(struct backing_dev_info *bdi) -{ - unsigned long background_thresh, dirty_thresh; - - global_dirty_limits(&background_thresh, &dirty_thresh); - - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh) - return true; - - if (bdi_stat(bdi, BDI_RECLAIMABLE) > - bdi_dirty_limit(bdi, background_thresh)) - return true; - - return false; -} - -/* - * Called under wb->list_lock. If there are multiple wb per bdi, - * only the flusher working on the first wb should do it. - */ -static void wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long start_time) -{ - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); -} - /* * Explicit flushing or periodic writeback of "old" data. * @@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb, * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && - !list_empty(&wb->bdi->work_list)) + !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb->bdi)) + if (work->for_background && !wb_over_bg_thresh(wb)) break; /* @@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ -static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi) +static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&bdi->wb_lock); - if (!list_empty(&bdi->work_list)) { - work = list_entry(bdi->work_list.next, + spin_lock_bh(&wb->work_lock); + if (!list_empty(&wb->work_list)) { + work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); return work; } @@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb->bdi)) { + if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, @@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) */ static long wb_do_writeback(struct bdi_writeback *wb) { - struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; long wrote = 0; - set_bit(BDI_writeback_running, &wb->bdi->state); - while ((work = get_next_work_item(bdi)) != NULL) { + set_bit(WB_writeback_running, &wb->state); + while ((work = get_next_work_item(wb)) != NULL) { + struct wb_completion *done = work->done; + bool need_wake_up = false; - trace_writeback_exec(bdi, work); + trace_writeback_exec(wb->bdi, work); wrote += wb_writeback(wb, work); - /* - * Notify the caller of completion if this is a synchronous - * work item, otherwise just free it. - */ - if (work->done) - complete(work->done); - else + if (work->single_wait) { + WARN_ON_ONCE(work->auto_free); + /* paired w/ rmb in wb_wait_for_single_work() */ + smp_wmb(); + work->single_done = 1; + need_wake_up = true; + } else if (work->auto_free) { kfree(work); + } + + if (done && atomic_dec_and_test(&done->cnt)) + need_wake_up = true; + + if (need_wake_up) + wake_up_all(&wb->bdi->wb_waitq); } /* @@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); - clear_bit(BDI_writeback_running, &wb->bdi->state); + clear_bit(WB_writeback_running, &wb->state); return wrote; } @@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb) * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ -void bdi_writeback_workfn(struct work_struct *work) +void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); - struct backing_dev_info *bdi = wb->bdi; long pages_written; - set_worker_desc("flush-%s", dev_name(bdi->dev)); + set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - !test_bit(BDI_registered, &bdi->state))) { + !test_bit(WB_registered, &wb->state))) { /* - * The normal path. Keep writing back @bdi until its + * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken - * if @bdi is shutting down even when we're running off the + * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); - } while (!list_empty(&bdi->work_list)); + } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ - pages_written = writeback_inodes_wb(&bdi->wb, 1024, + pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list)) + if (!list_empty(&wb->work_list)) mod_delayed_work(bdi_wq, &wb->dwork, 0); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) - bdi_wakeup_thread_delayed(bdi); + wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } @@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + struct bdi_writeback *wb; + struct wb_iter iter; + if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, reason); + + bdi_for_each_wb(wb, bdi, &iter, 0) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + false, reason); } rcu_read_unlock(); } @@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - if (list_empty(&bdi->wb.b_dirty_time)) - continue; - bdi_wakeup_thread(bdi); + struct bdi_writeback *wb; + struct wb_iter iter; + + bdi_for_each_wb(wb, bdi, &iter, 0) + if (!list_empty(&bdi->wb.b_dirty_time)) + wb_wakeup(&bdi->wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); @@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = NULL; int dirtytime; trace_writeback_mark_inode_dirty(inode, flags); @@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + inode_attach_wb(inode, NULL); + if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; @@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + struct bdi_writeback *wb; + struct list_head *dirty_list; bool wakeup_bdi = false; - bdi = inode_to_bdi(inode); - spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); - if (bdi_cap_writeback_dirty(bdi)) { - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi-%s not registered\n", bdi->name); + wb = locked_inode_to_wb_and_lock_list(inode); - /* - * If this is the first dirty inode for this - * bdi, we have to wake-up the corresponding - * bdi thread to make sure background - * write-back happens later. - */ - if (!wb_has_dirty_io(&bdi->wb)) - wakeup_bdi = true; - } + WARN(bdi_cap_writeback_dirty(wb->bdi) && + !test_bit(WB_registered, &wb->state), + "bdi-%s not registered\n", wb->bdi->name); inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + dirty_list = &wb->b_dirty; else - list_move(&inode->i_wb_list, - &bdi->wb.b_dirty_time); - spin_unlock(&bdi->wb.list_lock); + dirty_list = &wb->b_dirty_time; + + wakeup_bdi = inode_wb_list_move_locked(inode, wb, + dirty_list); + + spin_unlock(&wb->list_lock); trace_writeback_dirty_inode_enqueue(inode); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); + /* + * If this is the first dirty inode for this bdi, + * we have to wake-up the corresponding bdi thread + * to make sure background write-back happens + * later. + */ + if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + wb_wakeup_delayed(wb); return; } } @@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason, bool skip_if_busy) +{ + DEFINE_WB_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, + .reason = reason, + }; + struct backing_dev_info *bdi = sb->s_bdi; + + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + return; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); + wb_wait_for_completion(bdi, &done); +} + /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock @@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { - DECLARE_COMPLETION_ONSTACK(done); - struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .tagged_writepages = 1, - .done = &done, - .nr_pages = nr, - .reason = reason, - }; - - if (sb->s_bdi == &noop_backing_dev_info) - return; - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); @@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb_nr(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason) { - if (writeback_in_progress(sb->s_bdi)) - return 1; - if (!down_read_trylock(&sb->s_umount)) - return 0; + return false; - writeback_inodes_sb_nr(sb, nr, reason); + __writeback_inodes_sb_nr(sb, nr, reason, true); up_read(&sb->s_umount); - return 1; + return true; } EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); @@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } @@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DECLARE_COMPLETION_ONSTACK(done); + DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -1516,14 +2273,15 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, .for_sync = 1, }; + struct backing_dev_info *bdi = sb->s_bdi; /* Nothing to do? */ - if (sb->s_bdi == &noop_backing_dev_info) + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + bdi_split_work_to_wbs(bdi, &work, false); + wb_wait_for_completion(bdi, &done); wait_sb_inodes(sb); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ef05b5c4cff..8c5e2fa68835 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } @@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); goto out; @@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c18b49dc5d4f..894fb01a91da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); - if (bdi->dirty_exceeded) + if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 410b65eea683..4574fdd3d421 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 593af2fdcc2d..7302d96ae8bf 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/vfs.h> diff --git a/fs/inode.c b/fs/inode.c index 6e342cadef81..a049dc467c1a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode->i_flctx); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 1ba5c97943b8..cfbceb116356 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -845,9 +845,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); - /* Might as well let the VFS know */ - d_instantiate(new_dentry, d_inode(old_dentry)); - ihold(d_inode(old_dentry)); + /* + * We can't keep the target in dcache after that. + * For one thing, we can't afford dentry aliases for directories. + * For another, if there was a victim, we _can't_ set new inode + * for that sucker and we have to trigger mount eviction - the + * caller won't do it on its own since we are returning an error. + */ + d_invalidate(new_dentry); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/mpage.c b/fs/mpage.c index 3e79220babac..ca0244b69de8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,8 @@ alloc_new: bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; + + wbc_init_bio(wbc, bio); } /* @@ -612,6 +614,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ + wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(WRITE, bio); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 80021c709af9..0c2632386f35 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_rehash(newdent); } else { spin_lock(&dentry->d_lock); - NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE; spin_unlock(&dentry->d_lock); } } else { diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index fb1fb2774d34..02ec07973bc4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -32,6 +32,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include <linux/sunrpc/metrics.h> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9e6475bc5ba2..7e3c4604bea8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page) struct inode *inode = page_file_mapping(page)->host; inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d9851a6a2813..5051926f6356 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -853,7 +853,8 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, + WB_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc3a9efdaab8..42468e5ab3e7 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err) const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - /* to be detected by nilfs_segbuf_submit_bio() */ - } - if (!uptodate) atomic_inc(&segbuf->sb_err); @@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_get(bio); submit_bio(mode, bio); segbuf->sb_nbio++; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - bio_put(bio); - err = -EOPNOTSUPP; - goto failed; - } - bio_put(bio); wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d8b670cbd909..8f1feca89fb0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -37,6 +37,7 @@ #include <linux/falloc.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <cluster/masklog.h> diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3bc21..a2b1d7ce3e1a 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * sole user of this dentry. Too tricky... Just unhash for * now. */ - d_drop(dentry); + if (!err) + d_drop(dentry); mutex_unlock(&dir->i_mutex); return err; @@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (!overwrite && new_is_dir && !old_opaque && new_opaque) ovl_remove_opaque(newdentry); + /* + * Old dentry now lives in different location. Dentries in + * lowerstack are stale. We cannot drop them here because + * access to them is lockless. This could be only pure upper + * or opaque directory - numlower is zero. Or upper non-dir + * entry - its pureness is tracked by flag opaque. + */ if (old_opaque != new_opaque) { ovl_dentry_set_opaque(old, new_opaque); if (!overwrite) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index a1b069e5e363..e505b44a9184 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -66,6 +66,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (upperdentry) { mutex_lock(&upperdentry->d_inode->i_mutex); err = notify_change(upperdentry, attr, NULL); + if (!err) + ovl_copyattr(upperdentry->d_inode, dentry->d_inode); mutex_unlock(&upperdentry->d_inode->i_mutex); } else { err = ovl_copy_up_last(dentry, attr, false); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bd6d5c1e667d..39266655d2bd 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) if (oe->__upperdentry) { type = __OVL_PATH_UPPER; - if (oe->numlower) { - if (S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - } else if (!oe->opaque) { + /* + * Non-dir dentry can hold lower dentry from previous + * location. Its purity depends only on opaque flag. + */ + if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) + type |= __OVL_PATH_MERGE; + else if (!oe->opaque) type |= __OVL_PATH_PURE; - } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index cf6fa25f884b..9a895b415711 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -21,6 +21,7 @@ #include "xattr.h" #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/quotaops.h> diff --git a/fs/ufs/super.c b/fs/ufs/super.c index dc33f9416340..250579a80d90 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -80,6 +80,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/parser.h> #include <linux/buffer_head.h> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960dd1684..feb7e3f01e8e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1874,6 +1874,7 @@ xfs_vm_set_page_dirty( loff_t end_offset; loff_t offset; int newly_dirty; + struct mem_cgroup *memcg; if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -1893,6 +1894,11 @@ xfs_vm_set_page_dirty( offset += 1 << inode->i_blkbits; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1903,13 +1909,15 @@ xfs_vm_set_page_dirty( spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } + mem_cgroup_end_page_stat(memcg); + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3b7591224f4a..7c62fca53e2f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -41,6 +41,7 @@ #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> +#include <linux/backing-dev.h> static const struct vm_operations_struct xfs_file_vm_ops; |