aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/btrfs/disk-io.c11
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/buffer.c74
-rw-r--r--fs/debugfs/file.c191
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext4/ext4.h15
-rw-r--r--fs/ext4/extents.c90
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/inode.c71
-rw-r--r--fs/ext4/mballoc.c1
-rw-r--r--fs/ext4/page-io.c9
-rw-r--r--fs/ext4/super.c4
-rw-r--r--fs/ext4/truncate.h2
-rw-r--r--fs/f2fs/node.c4
-rw-r--r--fs/f2fs/segment.h3
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/fat/inode.c1
-rw-r--r--fs/fs-writeback.c1168
-rw-r--r--fs/fuse/file.c12
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/super.c1
-rw-r--r--fs/inode.c1
-rw-r--r--fs/jffs2/dir.c11
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/nfs/filelayout/filelayout.c1
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nilfs2/segbuf.c12
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--fs/overlayfs/dir.c10
-rw-r--r--fs/overlayfs/inode.c2
-rw-r--r--fs/overlayfs/super.c12
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/ufs/super.c1
-rw-r--r--fs/xfs/xfs_aops.c12
-rw-r--r--fs/xfs/xfs_file.c1
39 files changed, 1338 insertions, 412 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ccfd31f1df3a..b1aa9877acba 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -14,6 +14,7 @@
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
@@ -546,7 +547,8 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
@@ -687,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode)
return bdev;
}
-int sb_is_blkdev_sb(struct super_block *sb)
-{
- return sb == blockdev_superblock;
-}
-
/* Call when you free inode */
void bd_forget(struct inode *inode)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 99e8f60c7962..ff10d6e093f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3272,11 +3272,8 @@ static int write_dev_supers(struct btrfs_device *device,
*/
static void btrfs_end_empty_barrier(struct bio *bio, int err)
{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
@@ -3304,11 +3301,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
- rcu_str_deref(device->name));
- device->nobarriers = 1;
- } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_FLUSH_ERRS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 885f533a34d9..32e0df2d0bd6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
else
btrfsic_submit_bio(rw, bio);
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
bio_put(bio);
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 2907544c3a1d..8ba5bf01d341 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -30,6 +30,7 @@
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
+#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
@@ -44,6 +45,9 @@
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+ unsigned long bio_flags,
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -615,21 +619,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
*/
-static void __set_page_dirty(struct page *page,
- struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg, int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
/*
@@ -660,6 +665,7 @@ static void __set_page_dirty(struct page *page,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
+ struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -675,11 +681,22 @@ int __set_page_dirty_buffers(struct page *page)
bh = bh->b_this_page;
} while (bh != head);
}
+ /*
+ * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, 1);
+ __set_page_dirty(page, mapping, memcg, 1);
+
+ mem_cgroup_end_page_stat(memcg);
+
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1150,11 +1167,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
+ struct address_space *mapping = NULL;
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_begin_page_stat(page);
if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, 0);
+ __set_page_dirty(page, mapping, memcg, 0);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (mapping)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
}
EXPORT_SYMBOL(mark_buffer_dirty);
@@ -1676,8 +1700,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : WRITE);
+ int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1766,7 +1789,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(write_op, bh);
+ submit_bh_wbc(write_op, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -1820,7 +1843,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh(write_op, bh);
+ submit_bh_wbc(write_op, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -2930,10 +2953,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
{
struct buffer_head *bh = bio->bi_private;
- if (err == -EOPNOTSUPP) {
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- }
-
if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
set_bit(BH_Quiet, &bh->b_state);
@@ -2989,7 +3008,8 @@ void guard_bio_eod(int rw, struct bio *bio)
}
}
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+ unsigned long bio_flags, struct writeback_control *wbc)
{
struct bio *bio;
int ret = 0;
@@ -3012,6 +3032,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
*/
bio = bio_alloc(GFP_NOIO, 1);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3033,20 +3058,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
if (buffer_prio(bh))
rw |= REQ_PRIO;
- bio_get(bio);
submit_bio(rw, bio);
-
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
-
- bio_put(bio);
return ret;
}
+
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+ return submit_bh_wbc(rw, bh, bio_flags, NULL);
+}
EXPORT_SYMBOL_GPL(_submit_bh);
int submit_bh(int rw, struct buffer_head *bh)
{
- return _submit_bh(rw, bh, 0);
+ return submit_bh_wbc(rw, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3235,8 +3259,8 @@ int try_to_free_buffers(struct page *page)
* to synchronise against __set_page_dirty_buffers and prevent the
* dirty bit from being lost.
*/
- if (ret && TestClearPageDirty(page))
- account_page_cleaned(page, mapping);
+ if (ret)
+ cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 830a7e76f5c6..39916043da10 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -54,6 +54,22 @@ const struct inode_operations debugfs_link_operations = {
.follow_link = debugfs_follow_link,
};
+static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_wo);
+
+ return debugfs_create_file(name, mode, parent, value, fops);
+}
+
static int debugfs_u8_set(void *data, u64 val)
{
*(u8 *)data = val;
@@ -95,14 +111,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ &fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -147,14 +157,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ &fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -199,14 +203,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ &fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -252,17 +250,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ &fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
+static int debugfs_ulong_set(void *data, u64 val)
+{
+ *(unsigned long *)data = val;
+ return 0;
+}
+
+static int debugfs_ulong_get(void *data, u64 *val)
+{
+ *val = *(unsigned long *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+
+/**
+ * debugfs_create_ulong - create a debugfs file that is used to read and write
+ * an unsigned long value.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and write
+ * to.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned. It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+ struct dentry *parent, unsigned long *value)
+{
+ return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
+ &fops_ulong_ro, &fops_ulong_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_ulong);
+
DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
@@ -276,6 +316,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -298,14 +340,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ &fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -322,14 +358,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ &fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -346,14 +376,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ &fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -370,7 +394,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_x64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ &fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -387,6 +412,8 @@ static int debugfs_size_t_get(void *data, u64 *val)
}
DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
"%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -401,7 +428,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -434,24 +462,16 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[3];
- u32 *val = file->private_data;
+ bool *val = file->private_data;
if (*val)
buf[0] = 'Y';
@@ -461,14 +481,15 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[32];
size_t buf_size;
bool bv;
- u32 *val = file->private_data;
+ bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
@@ -480,10 +501,23 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
return count;
}
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
static const struct file_operations fops_bool = {
- .read = read_file_bool,
- .write = write_file_bool,
+ .read = debugfs_read_file_bool,
+ .write = debugfs_write_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_ro = {
+ .read = debugfs_read_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_wo = {
+ .write = debugfs_write_file_bool,
.open = simple_open,
.llseek = default_llseek,
};
@@ -513,9 +547,10 @@ static const struct file_operations fops_bool = {
* code.
*/
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+ struct dentry *parent, bool *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_bool);
+ return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ &fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d0e746e96511..900e19cf9ef6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
MS_POSIXACL : 0);
+ sb->s_iflags |= SB_I_CGROUPWB;
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9a83f149ac85..8b033f24a6b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -191,7 +191,7 @@ typedef struct ext4_io_end {
} ext4_io_end_t;
struct ext4_io_submit {
- int io_op;
+ struct writeback_control *io_wbc;
struct bio *io_bio;
ext4_io_end_t *io_end;
sector_t io_next_block;
@@ -873,6 +873,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
+ /*
+ * i_mmap_sem is for serializing page faults with truncate / punch hole
+ * operations. We have to make sure that new page cannot be faulted in
+ * a section of the inode that is being punched. We cannot easily use
+ * i_data_sem for this since we need protection for the whole punch
+ * operation and i_data_sem ranks below transaction start so we have
+ * to occasionally drop it.
+ */
+ struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -2287,6 +2296,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
@@ -2632,6 +2642,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
return changed;
}
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len);
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 87ba10d1d3bc..677955986ad7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
+#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -4741,7 +4742,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
- struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4757,17 +4757,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + len - 1);
- if (ret)
- return ret;
- }
-
- /*
* Round up offset. This is not fallocate, we neet to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
@@ -4810,6 +4799,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /* Wait for all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4818,7 +4811,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags, mode);
if (ret)
- goto out_mutex;
+ goto out_dio;
}
@@ -4827,16 +4820,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
- /* Now release the pages and zero block aligned part of pages*/
+ /*
+ * Prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ goto out_dio;
+ }
+ /* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
@@ -4964,8 +4964,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
+ /* Wait for all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ ext4_inode_resume_unlocked_dio(inode);
if (ret)
goto out;
@@ -5424,21 +5429,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down offset to be aligned with page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
mutex_lock(&inode->i_mutex);
-
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
@@ -5454,17 +5445,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /*
+ * Write tail of the last page before removed range since it will get
+ * removed from the page cache below.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ if (ret)
+ goto out_mmap;
+ /*
+ * Write data that will be shifted to preserve it when discarding
+ * page cache below. We are also protected from pages becoming dirty
+ * by i_mmap_sem.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5503,7 +5520,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0613c256c344..dd65fac5ff2f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -213,7 +213,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2b3a53a51582..3291e1af0e24 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3524,6 +3524,35 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise i_disksize update
+ * can get lost as it may have been postponed to submission of writeback but
+ * that will never happen after we truncate page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ loff_t size = i_size_read(inode);
+
+ WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ if (offset > size || offset + len < size)
+ return 0;
+
+ if (EXT4_I(inode)->i_disksize >= size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_update_i_disksize(inode, size);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return 0;
+}
+
+/*
* ext4_punch_hole: punches a hole in a file by releaseing the blocks
* associated with the given offset and length
*
@@ -3588,17 +3617,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
+ /* Wait for all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset)
+ if (last_block_offset > first_block_offset) {
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
+ if (ret)
+ goto out_dio;
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
+ }
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -3645,16 +3683,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- /* Now release the pages again to reduce race window */
- if (last_block_offset > first_block_offset)
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
-
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
@@ -4775,11 +4809,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
+ down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
@@ -5234,6 +5270,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
@@ -5303,6 +5341,19 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int err;
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_fault(vma, vmf);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return err;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 41260489d3bc..5b1613a54307 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -26,6 +26,7 @@
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <trace/events/ext4.h>
#ifdef CONFIG_EXT4_DEBUG
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 8082565c59a9..00cda09e4412 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -357,9 +357,10 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
+ int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE;
bio_get(io->io_bio);
- submit_bio(io->io_op, io->io_bio);
- BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+ submit_bio(io_op, io->io_bio);
bio_put(io->io_bio);
}
io->io_bio = NULL;
@@ -368,7 +369,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_wbc = wbc;
io->io_bio = NULL;
io->io_end = NULL;
}
@@ -382,6 +383,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
if (!bio)
return -ENOMEM;
+ wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
@@ -410,6 +412,7 @@ submit_and_retry:
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
+ wbc_account_io(io->io_wbc, page, bh->b_size);
io->io_next_block++;
return 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a3b9f14d198..abb7ec3f19b9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
@@ -945,6 +946,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
+ init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -3649,6 +3651,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
}
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670d99..c70d06a383e2 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8ab0cf1930bd..d211602e0f86 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DIRTY_DENTS) {
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
}
return res;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 85d7fa7514b2..aba72f7a8ac4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -9,6 +9,7 @@
* published by the Free Software Foundation.
*/
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
/* constant macro */
#define NULL_SEGNO ((unsigned int)(~0))
@@ -713,7 +714,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
*/
static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
{
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return 0;
if (type == DATA)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 442d50a0e33e..a08f1039909a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include "fat.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c06774658345..509411dd3698 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,6 +18,7 @@
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <asm/unaligned.h>
#include "fat.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 32a8bbd7a9ad..518c6294bf6c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
+#include <linux/memcontrol.h>
#include "internal.h"
/*
@@ -34,6 +35,10 @@
*/
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+struct wb_completion {
+ atomic_t cnt;
+};
+
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
@@ -47,13 +52,29 @@ struct wb_writeback_work {
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+ unsigned int auto_free:1; /* free on completion */
+ unsigned int single_wait:1;
+ unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
- struct completion *done; /* set if the caller waits */
+ struct wb_completion *done; /* set if the caller waits */
};
/*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro. Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion(). Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
+ struct wb_completion cmpl = { \
+ .cnt = ATOMIC_INIT(1), \
+ }
+
+
+/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
@@ -65,35 +86,6 @@ struct wb_writeback_work {
*/
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
- return test_bit(BDI_writeback_running, &bdi->state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
- struct super_block *sb;
-
- if (!inode)
- return &noop_backing_dev_info;
-
- sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
- if (sb_is_blkdev_sb(sb))
- return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
- return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
static inline struct inode *wb_inode(struct list_head *head)
{
return list_entry(head, struct inode, i_wb_list);
@@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head)
EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
+{
+ if (wb_has_dirty_io(wb)) {
+ return false;
+ } else {
+ set_bit(WB_has_dirty_io, &wb->state);
+ WARN_ON_ONCE(!wb->avg_write_bandwidth);
+ atomic_long_add(wb->avg_write_bandwidth,
+ &wb->bdi->tot_write_bandwidth);
+ return true;
+ }
+}
+
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
- spin_lock_bh(&bdi->wb_lock);
- if (test_bit(BDI_registered, &bdi->state))
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
- spin_unlock_bh(&bdi->wb_lock);
+ if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+ list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
+ clear_bit(WB_has_dirty_io, &wb->state);
+ WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+ &wb->bdi->tot_write_bandwidth) < 0);
+ }
}
-static void bdi_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @head of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+ struct bdi_writeback *wb,
+ struct list_head *head)
{
- trace_writeback_queue(bdi, work);
+ assert_spin_locked(&wb->list_lock);
+
+ list_move(&inode->i_wb_list, head);
- spin_lock_bh(&bdi->wb_lock);
- if (!test_bit(BDI_registered, &bdi->state)) {
- if (work->done)
- complete(work->done);
+ /* dirty_time doesn't count as dirty_io until expiration */
+ if (head != &wb->b_dirty_time)
+ return wb_io_lists_populated(wb);
+
+ wb_io_lists_depopulated(wb);
+ return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+
+ list_del_init(&inode->i_wb_list);
+ wb_io_lists_depopulated(wb);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+ spin_lock_bh(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+static void wb_queue_work(struct bdi_writeback *wb,
+ struct wb_writeback_work *work)
+{
+ trace_writeback_queue(wb->bdi, work);
+
+ spin_lock_bh(&wb->work_lock);
+ if (!test_bit(WB_registered, &wb->state)) {
+ if (work->single_wait)
+ work->single_done = 1;
goto out_unlock;
}
- list_add_tail(&work->list, &bdi->work_list);
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ if (work->done)
+ atomic_inc(&work->done->cnt);
+ list_add_tail(&work->list, &wb->work_list);
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
+ * work items are completed. Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+ struct wb_completion *done)
+{
+ atomic_dec(&done->cnt); /* put down the initial count */
+ wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/* parameters for foreign inode detection, see wbc_detach_inode() */
+#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
+
+#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+ /* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
+ /* if foreign slots > 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
+ /* one round can affect upto 5 slots */
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb = NULL;
+
+ if (inode_cgwb_enabled(inode)) {
+ struct cgroup_subsys_state *memcg_css;
+
+ if (page) {
+ memcg_css = mem_cgroup_css_from_page(page);
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ } else {
+ /* must pin memcg_css, see wb_get_create() */
+ memcg_css = task_get_css(current, memory_cgrp_id);
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
+ }
+ }
+
+ if (!wb)
+ wb = &bdi->wb;
+
+ /*
+ * There may be multiple instances of this function racing to
+ * update the same inode. Use cmpxchg() to tell the winner.
+ */
+ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+ wb_put(wb);
+}
+
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held. @inode->i_lock must be
+ * held on entry and is released on return. The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+ __releases(&inode->i_lock)
+ __acquires(&wb->list_lock)
+{
+ while (true) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ /*
+ * inode_to_wb() association is protected by both
+ * @inode->i_lock and @wb->list_lock but list_lock nests
+ * outside i_lock. Drop i_lock and verify that the
+ * association hasn't changed after acquiring list_lock.
+ */
+ wb_get(wb);
+ spin_unlock(&inode->i_lock);
+ spin_lock(&wb->list_lock);
+ wb_put(wb); /* not gonna deref it anymore */
+
+ /* i_wb may have changed in between, can't use inode_to_wb() */
+ if (likely(wb == inode->i_wb))
+ return wb; /* @inode already has ref */
+
+ spin_unlock(&wb->list_lock);
+ cpu_relax();
+ spin_lock(&inode->i_lock);
+ }
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ __acquires(&wb->list_lock)
+{
+ spin_lock(&inode->i_lock);
+ return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+ struct inode *inode;
+ struct bdi_writeback *new_wb;
+
+ struct rcu_head rcu_head;
+ struct work_struct work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(work, struct inode_switch_wbs_context, work);
+ struct inode *inode = isw->inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct bdi_writeback *old_wb = inode->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ struct radix_tree_iter iter;
+ bool switched = false;
+ void **slot;
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against mapping->tree_lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+ spin_lock(&inode->i_lock);
+ spin_lock_irq(&mapping->tree_lock);
+
+ /*
+ * Once I_FREEING is visible under i_lock, the eviction path owns
+ * the inode and we shouldn't modify ->i_wb_list.
+ */
+ if (unlikely(inode->i_state & I_FREEING))
+ goto skip_switch;
+
+ /*
+ * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
+ * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+ * pages actually under writeback.
+ */
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_DIRTY) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page) && PageDirty(page)) {
+ __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+ __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+ }
+ }
+
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_WRITEBACK) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page)) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ __dec_wb_stat(old_wb, WB_WRITEBACK);
+ __inc_wb_stat(new_wb, WB_WRITEBACK);
+ }
+ }
+
+ wb_get(new_wb);
+
+ /*
+ * Transfer to @new_wb's IO list if necessary. The specific list
+ * @inode was on is ignored and the inode is put on ->b_dirty which
+ * is always correct including from ->b_dirty_time. The transfer
+ * preserves @inode->dirtied_when ordering.
+ */
+ if (!list_empty(&inode->i_wb_list)) {
+ struct inode *pos;
+
+ inode_wb_list_del_locked(inode, old_wb);
+ inode->i_wb = new_wb;
+ list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ } else {
+ inode->i_wb = new_wb;
+ }
+
+ /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
+ inode->i_wb_frn_winner = 0;
+ inode->i_wb_frn_avg_time = 0;
+ inode->i_wb_frn_history = 0;
+ switched = true;
+skip_switch:
+ /*
+ * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+ * ensures that the new wb is visible if they see !I_WB_SWITCH.
+ */
+ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+
+ if (switched) {
+ wb_wakeup(new_wb);
+ wb_put(old_wb);
+ }
+ wb_put(new_wb);
+
+ iput(inode);
+ kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct inode_switch_wbs_context *isw = container_of(rcu_head,
+ struct inode_switch_wbs_context, rcu_head);
+
+ /* needs to grab bh-unsafe locks, bounce to work item */
+ INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+ schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id. The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct cgroup_subsys_state *memcg_css;
+ struct inode_switch_wbs_context *isw;
+
+ /* noop if seems to be already in progress */
+ if (inode->i_state & I_WB_SWITCH)
+ return;
+
+ isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ if (!isw)
+ return;
+
+ /* find and pin the new wb */
+ rcu_read_lock();
+ memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+ if (memcg_css)
+ isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ rcu_read_unlock();
+ if (!isw->new_wb)
+ goto out_free;
+
+ /* while holding I_WB_SWITCH, no one else can update the association */
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ inode_to_wb(inode) == isw->new_wb) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
+ inode->i_state |= I_WB_SWITCH;
+ spin_unlock(&inode->i_lock);
+
+ ihold(inode);
+ isw->inode = inode;
+
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH tells
+ * the RCU protected stat update paths to grab the mapping's
+ * tree_lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+ */
+ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+ return;
+
+out_free:
+ if (isw->new_wb)
+ wb_put(isw->new_wb);
+ kfree(isw);
+}
+
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock. On
+ * writeback completion, wbc_detach_inode() should be called. This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ if (!inode_cgwb_enabled(inode)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ wbc->wb = inode_to_wb(inode);
+ wbc->inode = inode;
+
+ wbc->wb_id = wbc->wb->memcg_css->id;
+ wbc->wb_lcand_id = inode->i_wb_frn_winner;
+ wbc->wb_tcand_id = 0;
+ wbc->wb_bytes = 0;
+ wbc->wb_lcand_bytes = 0;
+ wbc->wb_tcand_bytes = 0;
+
+ wb_get(wbc->wb);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * A dying wb indicates that the memcg-blkcg mapping has changed
+ * and a new wb is already serving the memcg. Switch immediately.
+ */
+ if (unlikely(wb_dying(wbc->wb)))
+ inode_switch_wbs(inode, wbc->wb_id);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode(). Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on first-use basis severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode. While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it. To avoid unnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm. In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate). Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history. If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+ struct bdi_writeback *wb = wbc->wb;
+ struct inode *inode = wbc->inode;
+ unsigned long avg_time, max_bytes, max_time;
+ u16 history;
+ int max_id;
+
+ if (!wb)
+ return;
+
+ history = inode->i_wb_frn_history;
+ avg_time = inode->i_wb_frn_avg_time;
+
+ /* pick the winner of this round */
+ if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+ wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+ max_id = wbc->wb_id;
+ max_bytes = wbc->wb_bytes;
+ } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+ max_id = wbc->wb_lcand_id;
+ max_bytes = wbc->wb_lcand_bytes;
+ } else {
+ max_id = wbc->wb_tcand_id;
+ max_bytes = wbc->wb_tcand_bytes;
+ }
+
+ /*
+ * Calculate the amount of IO time the winner consumed and fold it
+ * into the running average kept per inode. If the consumed IO
+ * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
+ * deciding whether to switch or not. This is to prevent one-off
+ * small dirtiers from skewing the verdict.
+ */
+ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+ wb->avg_write_bandwidth);
+ if (avg_time)
+ avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+ (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+ else
+ avg_time = max_time; /* immediate catch up on first run */
+
+ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+ int slots;
+
+ /*
+ * The switch verdict is reached if foreign wb's consume
+ * more than a certain proportion of IO time in a
+ * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
+ * history mask where each bit represents one sixteenth of
+ * the period. Determine the number of slots to shift into
+ * history from @max_time.
+ */
+ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+ (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+ history <<= slots;
+ if (wbc->wb_id != max_id)
+ history |= (1U << slots) - 1;
+
+ /*
+ * Switch if the current wb isn't the consistent winner.
+ * If there are multiple closely competing dirtiers, the
+ * inode may switch across them repeatedly over time, which
+ * is okay. The main goal is avoiding keeping an inode on
+ * the wrong wb for an extended period of time.
+ */
+ if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+ inode_switch_wbs(inode, max_id);
+ }
+
+ /*
+ * Multiple instances of this function may race to update the
+ * following fields but we don't mind occasional inaccuracies.
+ */
+ inode->i_wb_frn_winner = max_id;
+ inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+ inode->i_wb_frn_history = history;
+
+ wb_put(wbc->wb);
+ wbc->wb = NULL;
+}
+
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to be written out during the writeback
+ * controlled by @wbc. Keep the book for foreign inode detection. See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+ size_t bytes)
+{
+ int id;
+
+ /*
+ * pageout() path doesn't attach @wbc to the inode being written
+ * out. This is intentional as we don't want the function to block
+ * behind a slow cgroup. Ultimately, we want pageout() to kick off
+ * regular writeback instead of writing things out itself.
+ */
+ if (!wbc->wb)
+ return;
+
+ rcu_read_lock();
+ id = mem_cgroup_css_from_page(page)->id;
+ rcu_read_unlock();
+
+ if (id == wbc->wb_id) {
+ wbc->wb_bytes += bytes;
+ return;
+ }
+
+ if (id == wbc->wb_lcand_id)
+ wbc->wb_lcand_bytes += bytes;
+
+ /* Boyer-Moore majority vote algorithm */
+ if (!wbc->wb_tcand_bytes)
+ wbc->wb_tcand_id = id;
+ if (id == wbc->wb_tcand_id)
+ wbc->wb_tcand_bytes += bytes;
+ else
+ wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
+EXPORT_SYMBOL_GPL(wbc_account_io);
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic, enum wb_reason reason)
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested. @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+ /*
+ * Once set, ->i_wb never becomes NULL while the inode is alive.
+ * Start transaction iff ->i_wb is visible.
+ */
+ if (inode && inode_to_wb_is_valid(inode)) {
+ struct bdi_writeback *wb;
+ bool locked, congested;
+
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
+ congested = wb_congested(wb, cong_bits);
+ unlocked_inode_to_wb_end(inode, locked);
+ return congested;
+ }
+
+ return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+/**
+ * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
+ * @bdi: bdi the work item was issued to
+ * @work: work item to wait for
+ *
+ * Wait for the completion of @work which was issued to one of @bdi's
+ * bdi_writeback's. The caller must have set @work->single_wait before
+ * issuing it. This wait operates independently of
+ * wb_wait_for_completion() and also disables automatic freeing of @work.
+ */
+static void wb_wait_for_single_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+{
+ if (WARN_ON_ONCE(!work->single_wait))
+ return;
+
+ wait_event(bdi->wb_waitq, work->single_done);
+
+ /*
+ * Paired with smp_wmb() in wb_do_writeback() and ensures that all
+ * modifications to @work prior to assertion of ->single_done is
+ * visible to the caller once this function returns.
+ */
+ smp_rmb();
+}
+
+/**
+ * wb_split_bdi_pages - split nr_pages to write according to bandwidth
+ * @wb: target bdi_writeback to split @nr_pages to
+ * @nr_pages: number of pages to write for the whole bdi
+ *
+ * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
+ * relation to the total write bandwidth of all wb's w/ dirty inodes on
+ * @wb->bdi.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+ unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+ if (nr_pages == LONG_MAX)
+ return LONG_MAX;
+
+ /*
+ * This may be called on clean wb's and proportional distribution
+ * may not make sense, just use the original @nr_pages in those
+ * cases. In general, we wanna err on the side of writing more.
+ */
+ if (!tot_bw || this_bw >= tot_bw)
+ return nr_pages;
+ else
+ return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
+}
+
+/**
+ * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
+ * @wb: target bdi_writeback
+ * @base_work: source wb_writeback_work
+ *
+ * Try to make a clone of @base_work and issue it to @wb. If cloning
+ * succeeds, %true is returned; otherwise, @base_work is issued directly
+ * and %false is returned. In the latter case, the caller is required to
+ * wait for @base_work's completion using wb_wait_for_single_work().
+ *
+ * A clone is auto-freed on completion. @base_work never is.
+ */
+static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
+ struct wb_writeback_work *base_work)
{
struct wb_writeback_work *work;
+ /*
+ * The caller in this file, bdi_split_work_to_wbs(), invokes this
+ * inside an rcu_read_lock() section, hence the atomic allocation.
+ */
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ /* private copy; wb_do_writeback() kfree()s auto_free works */
+ *work = *base_work;
+ work->auto_free = 1;
+ work->single_wait = 0;
+ } else {
+ /* no memory for a clone: queue @base_work itself, caller waits */
+ work = base_work;
+ work->auto_free = 0;
+ work->single_wait = 1;
+ }
+ work->single_done = 0;
+ wb_queue_work(wb, work);
+ /* true iff a clone, not @base_work, was queued */
+ return work != base_work;
+}
+
+/**
+ * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
+ * @bdi: target backing_dev_info
+ * @base_work: wb_writeback_work to issue
+ * @skip_if_busy: skip wb's which already have writeback in progress
+ *
+ * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
+ * have dirty inodes. If @base_work->nr_pages isn't %LONG_MAX, it's
+ * distributed to the busy wbs according to each wb's proportion in the
+ * total active write bandwidth of @bdi.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+ struct wb_writeback_work *base_work,
+ bool skip_if_busy)
+{
+ /* bdi-wide total; @base_work->nr_pages is overwritten per wb below */
+ long nr_pages = base_work->nr_pages;
+ int next_blkcg_id = 0;
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
+ might_sleep();
+
+ if (!bdi_has_dirty_io(bdi))
+ return;
+restart:
+ rcu_read_lock();
+ bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ if (!wb_has_dirty_io(wb) ||
+ (skip_if_busy && writeback_in_progress(wb)))
+ continue;
+
+ base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
+ if (!wb_clone_and_queue_work(wb, base_work)) {
+ /*
+ * Cloning failed and @base_work itself was queued. Record
+ * the id to resume from, leave the RCU section before
+ * sleeping until @base_work is done, then restart the
+ * iteration from the recorded id.
+ */
+ next_blkcg_id = wb->blkcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_single_work(bdi, base_work);
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+/*
+ * !CONFIG_CGROUP_WRITEBACK variant. Called with @inode->i_lock held:
+ * looks up the inode's wb, drops i_lock, then takes and returns with
+ * wb->list_lock held.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+ __releases(&inode->i_lock)
+ __acquires(&wb->list_lock)
+{
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ /* i_lock is released before list_lock is taken; the two never nest here */
+ spin_unlock(&inode->i_lock);
+ spin_lock(&wb->list_lock);
+ return wb;
+}
+
+/*
+ * !CONFIG_CGROUP_WRITEBACK variant: look up @inode's wb and return it
+ * with wb->list_lock held. The caller is responsible for unlocking.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ __acquires(&wb->list_lock)
+{
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ spin_lock(&wb->list_lock);
+ return wb;
+}
+
+/*
+ * !CONFIG_CGROUP_WRITEBACK variant: no proportional split is done and
+ * @nr_pages is returned unchanged; @wb is unused.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+ return nr_pages;
+}
+
+/*
+ * !CONFIG_CGROUP_WRITEBACK variant: @base_work is queued as-is on the wb
+ * embedded in @bdi instead of being split.  Nothing is queued when @bdi
+ * has no dirty IO, or when @skip_if_busy is set and writeback is already
+ * in progress on that wb.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+				  struct wb_writeback_work *base_work,
+				  bool skip_if_busy)
+{
+	might_sleep();
+
+	if (!bdi_has_dirty_io(bdi))
+		return;
+	if (skip_if_busy && writeback_in_progress(&bdi->wb))
+		return;
+
+	/* @base_work is queued directly: not auto-freed, not single-waited */
+	base_work->auto_free = 0;
+	base_work->single_wait = 0;
+	base_work->single_done = 0;
+	wb_queue_work(&bdi->wb, base_work);
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+ bool range_cyclic, enum wb_reason reason)
+{
+ struct wb_writeback_work *work;
+
+ if (!wb_has_dirty_io(wb))
+ return;
+
/*
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(bdi);
- bdi_wakeup_thread(bdi);
+ trace_writeback_nowork(wb->bdi);
+ wb_wakeup(wb);
return;
}
@@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
work->reason = reason;
+ work->auto_free = 1;
- bdi_queue_work(bdi, work);
+ wb_queue_work(wb, work);
}
/**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- * This does WB_SYNC_NONE opportunistic writeback. The IO is only
- * started when this function returns, we make no guarantees on
- * completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- enum wb_reason reason)
-{
- __bdi_start_writeback(bdi, nr_pages, true, reason);
-}
-
-/**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writeback to write from
*
* Description:
* This makes sure WB_SYNC_NONE background writeback happens. When
- * this function returns, it is only guaranteed that for given BDI
+ * this function returns, it is only guaranteed that for given wb
* some IO is happening if we are over background dirty threshold.
* Caller need not hold sb s_umount semaphore.
*/
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
{
/*
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(bdi);
- bdi_wakeup_thread(bdi);
+ trace_writeback_wake_background(wb->bdi);
+ wb_wakeup(wb);
}
/*
@@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
- spin_lock(&bdi->wb.list_lock);
- list_del_init(&inode->i_wb_list);
- spin_unlock(&bdi->wb.list_lock);
+ wb = inode_to_wb_and_lock_list(inode);
+ inode_wb_list_del_locked(inode, wb);
+ spin_unlock(&wb->list_lock);
}
/*
@@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode)
*/
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
- assert_spin_locked(&wb->list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_wb_list, &wb->b_dirty);
+ inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
}
/*
@@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- assert_spin_locked(&wb->list_lock);
- list_move(&inode->i_wb_list, &wb->b_more_io);
+ inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work);
+ if (moved)
+ wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, moved);
}
@@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, &wb->b_dirty_time);
+ inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
- list_del_init(&inode->i_wb_list);
+ inode_wb_list_del_locked(inode, wb);
}
}
@@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
inode->i_state |= I_SYNC;
- spin_unlock(&inode->i_lock);
+ wbc_attach_and_unlock_inode(wbc, inode);
ret = __writeback_single_inode(inode, wbc);
+ wbc_detach_inode(wbc);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
/*
@@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- list_del_init(&inode->i_wb_list);
+ inode_wb_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -624,7 +1386,7 @@ out:
return ret;
}
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
long pages;
@@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
pages = LONG_MAX;
else {
- pages = min(bdi->avg_write_bandwidth / 2,
- global_dirty_limit / DIRTY_SCOPE);
+ pages = min(wb->avg_write_bandwidth / 2,
+ global_wb_domain.dirty_limit / DIRTY_SCOPE);
pages = min(pages, work->nr_pages);
pages = round_down(pages + MIN_WRITEBACK_PAGES,
MIN_WRITEBACK_PAGES);
@@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb,
continue;
}
inode->i_state |= I_SYNC;
- spin_unlock(&inode->i_lock);
+ wbc_attach_and_unlock_inode(&wbc, inode);
- write_chunk = writeback_chunk_size(wb->bdi, work);
+ write_chunk = writeback_chunk_size(wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
@@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
__writeback_single_inode(inode, &wbc);
+ wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
@@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
return nr_pages - work.nr_pages;
}
-static bool over_bground_thresh(struct backing_dev_info *bdi)
-{
- unsigned long background_thresh, dirty_thresh;
-
- global_dirty_limits(&background_thresh, &dirty_thresh);
-
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) > background_thresh)
- return true;
-
- if (bdi_stat(bdi, BDI_RECLAIMABLE) >
- bdi_dirty_limit(bdi, background_thresh))
- return true;
-
- return false;
-}
-
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
- unsigned long start_time)
-{
- __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
-}
-
/*
* Explicit flushing or periodic writeback of "old" data.
*
@@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
- !list_empty(&wb->bdi->work_list))
+ !list_empty(&wb->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
- if (work->for_background && !over_bground_thresh(wb->bdi))
+ if (work->for_background && !wb_over_bg_thresh(wb))
break;
/*
@@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Return the next wb_writeback_work struct that hasn't been processed yet.
*/
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
struct wb_writeback_work *work = NULL;
- spin_lock_bh(&bdi->wb_lock);
- if (!list_empty(&bdi->work_list)) {
- work = list_entry(bdi->work_list.next,
+ spin_lock_bh(&wb->work_lock);
+ if (!list_empty(&wb->work_list)) {
+ work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
return work;
}
@@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void)
static long wb_check_background_flush(struct bdi_writeback *wb)
{
- if (over_bground_thresh(wb->bdi)) {
+ if (wb_over_bg_thresh(wb)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
@@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
*/
static long wb_do_writeback(struct bdi_writeback *wb)
{
- struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
- set_bit(BDI_writeback_running, &wb->bdi->state);
- while ((work = get_next_work_item(bdi)) != NULL) {
+ set_bit(WB_writeback_running, &wb->state);
+ while ((work = get_next_work_item(wb)) != NULL) {
+ struct wb_completion *done = work->done;
+ bool need_wake_up = false;
- trace_writeback_exec(bdi, work);
+ trace_writeback_exec(wb->bdi, work);
wrote += wb_writeback(wb, work);
- /*
- * Notify the caller of completion if this is a synchronous
- * work item, otherwise just free it.
- */
- if (work->done)
- complete(work->done);
- else
+ if (work->single_wait) {
+ WARN_ON_ONCE(work->auto_free);
+ /* paired w/ rmb in wb_wait_for_single_work() */
+ smp_wmb();
+ work->single_done = 1;
+ need_wake_up = true;
+ } else if (work->auto_free) {
kfree(work);
+ }
+
+ if (done && atomic_dec_and_test(&done->cnt))
+ need_wake_up = true;
+
+ if (need_wake_up)
+ wake_up_all(&wb->bdi->wb_waitq);
}
/*
@@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
- clear_bit(BDI_writeback_running, &wb->bdi->state);
+ clear_bit(WB_writeback_running, &wb->state);
return wrote;
}
@@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
* Handle writeback of dirty data for the device backed by this bdi. Also
* reschedules periodically and does kupdated style flushing.
*/
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
- struct backing_dev_info *bdi = wb->bdi;
long pages_written;
- set_worker_desc("flush-%s", dev_name(bdi->dev));
+ set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
- !test_bit(BDI_registered, &bdi->state))) {
+ !test_bit(WB_registered, &wb->state))) {
/*
- * The normal path. Keep writing back @bdi until its
+ * The normal path. Keep writing back @wb until its
* work_list is empty. Note that this path is also taken
- * if @bdi is shutting down even when we're running off the
+ * if @wb is shutting down even when we're running off the
* rescuer as work_list needs to be drained.
*/
do {
pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
- } while (!list_empty(&bdi->work_list));
+ } while (!list_empty(&wb->work_list));
} else {
/*
* bdi_wq can't get enough workers and we're running off
* the emergency worker. Don't hog it. Hopefully, 1024 is
* enough for efficient IO.
*/
- pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+ pages_written = writeback_inodes_wb(wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
- if (!list_empty(&bdi->work_list))
+ if (!list_empty(&wb->work_list))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- bdi_wakeup_thread_delayed(bdi);
+ wb_wakeup_delayed(wb);
current->flags &= ~PF_SWAPWRITE;
}
@@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false, reason);
+
+ bdi_for_each_wb(wb, bdi, &iter, 0)
+ wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
+ false, reason);
}
rcu_read_unlock();
}
@@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- if (list_empty(&bdi->wb.b_dirty_time))
- continue;
- bdi_wakeup_thread(bdi);
+		struct bdi_writeback *wb;
+		struct wb_iter iter;
+
+		/*
+		 * Test and wake the wb currently being iterated.  Checking
+		 * only bdi->wb on every pass would never wake the other wbs
+		 * of this bdi that hold b_dirty_time inodes.
+		 */
+		bdi_for_each_wb(wb, bdi, &iter, 0)
+			if (!list_empty(&wb->b_dirty_time))
+				wb_wakeup(wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
- struct backing_dev_info *bdi = NULL;
int dirtytime;
trace_writeback_mark_inode_dirty(inode, flags);
@@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
+ inode_attach_wb(inode, NULL);
+
if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
@@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
+ struct bdi_writeback *wb;
+ struct list_head *dirty_list;
bool wakeup_bdi = false;
- bdi = inode_to_bdi(inode);
- spin_unlock(&inode->i_lock);
- spin_lock(&bdi->wb.list_lock);
- if (bdi_cap_writeback_dirty(bdi)) {
- WARN(!test_bit(BDI_registered, &bdi->state),
- "bdi-%s not registered\n", bdi->name);
+ wb = locked_inode_to_wb_and_lock_list(inode);
- /*
- * If this is the first dirty inode for this
- * bdi, we have to wake-up the corresponding
- * bdi thread to make sure background
- * write-back happens later.
- */
- if (!wb_has_dirty_io(&bdi->wb))
- wakeup_bdi = true;
- }
+ WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+ !test_bit(WB_registered, &wb->state),
+ "bdi-%s not registered\n", wb->bdi->name);
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
+
if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
- list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ dirty_list = &wb->b_dirty;
else
- list_move(&inode->i_wb_list,
- &bdi->wb.b_dirty_time);
- spin_unlock(&bdi->wb.list_lock);
+ dirty_list = &wb->b_dirty_time;
+
+ wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+ dirty_list);
+
+ spin_unlock(&wb->list_lock);
trace_writeback_dirty_inode_enqueue(inode);
- if (wakeup_bdi)
- bdi_wakeup_thread_delayed(bdi);
+ /*
+ * If this is the first dirty inode for this bdi,
+ * we have to wake-up the corresponding bdi thread
+ * to make sure background write-back happens
+ * later.
+ */
+ if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+ wb_wakeup_delayed(wb);
return;
}
}
@@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
+/*
+ * Common helper for the writeback_inodes_sb_nr() family: issue
+ * WB_SYNC_NONE, tagged writeback of @nr pages for @sb to the wbs of its
+ * bdi via bdi_split_work_to_wbs(), then call wb_wait_for_completion() on
+ * the on-stack completion.  @skip_if_busy is forwarded so that wbs with
+ * writeback already in progress can be skipped.  Returns without doing
+ * anything when the bdi has no dirty IO or is noop_backing_dev_info.
+ */
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				     enum wb_reason reason, bool skip_if_busy)
+{
+	struct backing_dev_info *bdi = sb->s_bdi;
+	DEFINE_WB_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
+		.reason			= reason,
+	};
+
+	if (!bdi_has_dirty_io(bdi))
+		return;
+	if (bdi == &noop_backing_dev_info)
+		return;
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_split_work_to_wbs(bdi, &work, skip_if_busy);
+	wb_wait_for_completion(bdi, &done);
+}
+
/**
* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
@@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
unsigned long nr,
enum wb_reason reason)
{
- DECLARE_COMPLETION_ONSTACK(done);
- struct wb_writeback_work work = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .tagged_writepages = 1,
- .done = &done,
- .nr_pages = nr,
- .reason = reason,
- };
-
- if (sb->s_bdi == &noop_backing_dev_info)
- return;
- WARN_ON(!rwsem_is_locked(&sb->s_umount));
- bdi_queue_work(sb->s_bdi, &work);
- wait_for_completion(&done);
+ __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
@@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
* Invoke writeback_inodes_sb_nr if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
- unsigned long nr,
- enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+ enum wb_reason reason)
{
- if (writeback_in_progress(sb->s_bdi))
- return 1;
-
if (!down_read_trylock(&sb->s_umount))
- return 0;
+ return false;
- writeback_inodes_sb_nr(sb, nr, reason);
+ __writeback_inodes_sb_nr(sb, nr, reason, true);
up_read(&sb->s_umount);
- return 1;
+ return true;
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
@@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
* Implement by try_to_writeback_inodes_sb_nr()
* Returns 1 if writeback was started, 0 if not.
*/
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
@@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
*/
void sync_inodes_sb(struct super_block *sb)
{
- DECLARE_COMPLETION_ONSTACK(done);
+ DEFINE_WB_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_ALL,
@@ -1516,14 +2273,15 @@ void sync_inodes_sb(struct super_block *sb)
.reason = WB_REASON_SYNC,
.for_sync = 1,
};
+ struct backing_dev_info *bdi = sb->s_bdi;
/* Nothing to do? */
- if (sb->s_bdi == &noop_backing_dev_info)
+ if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- bdi_queue_work(sb->s_bdi, &work);
- wait_for_completion(&done);
+ bdi_split_work_to_wbs(bdi, &work, false);
+ wb_wait_for_completion(bdi, &done);
wait_sb_inodes(sb);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ef05b5c4cff..8c5e2fa68835 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
list_del(&req->writepages_entry);
for (i = 0; i < req->num_pages; i++) {
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
}
@@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page)
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
@@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
goto out;
@@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c18b49dc5d4f..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL)
gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
- if (bdi->dirty_exceeded)
+ if (bdi->wb.dirty_exceeded)
gfs2_ail1_flush(sdp, wbc);
else
filemap_fdatawrite(metamapping);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 410b65eea683..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 593af2fdcc2d..7302d96ae8bf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/vfs.h>
diff --git a/fs/inode.c b/fs/inode.c
index 6e342cadef81..a049dc467c1a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
+ inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1ba5c97943b8..cfbceb116356 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -845,9 +845,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220babac..ca0244b69de8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,8 @@ alloc_new:
bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
+
+ wbc_init_bio(wbc, bio);
}
/*
@@ -612,6 +614,7 @@ alloc_new:
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
+ wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 80021c709af9..0c2632386f35 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index fb1fb2774d34..02ec07973bc4 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -32,6 +32,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include <linux/sunrpc/metrics.h>
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e6475bc5ba2..7e3c4604bea8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page)
struct inode *inode = page_file_mapping(page)->host;
inc_zone_page_state(page, NR_UNSTABLE_NFS);
- inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d9851a6a2813..5051926f6356 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -853,7 +853,8 @@ static void
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+ dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+ WB_RECLAIMABLE);
}
/* Called holding inode (/cinfo) lock */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc3a9efdaab8..42468e5ab3e7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nilfs_segment_buffer *segbuf = bio->bi_private;
- if (err == -EOPNOTSUPP) {
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- /* to be detected by nilfs_segbuf_submit_bio() */
- }
-
if (!uptodate)
atomic_inc(&segbuf->sb_err);
@@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- bio_get(bio);
submit_bio(mode, bio);
segbuf->sb_nbio++;
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- bio_put(bio);
- err = -EOPNOTSUPP;
- goto failed;
- }
- bio_put(bio);
wi->bio = NULL;
wi->rest_blocks -= wi->end - wi->start;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd909..8f1feca89fb0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -37,6 +37,7 @@
#include <linux/falloc.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <cluster/masklog.h>
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 692ceda3bc21..a2b1d7ce3e1a 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
+ if (!err)
+ d_drop(dentry);
mutex_unlock(&dir->i_mutex);
return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * Old dentry now lives in different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This could be only pure upper
+ * or opaque directory - numlower is zero. Or upper non-dir
+ * entry - its pureness is tracked by flag opaque.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a1b069e5e363..e505b44a9184 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -66,6 +66,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (upperdentry) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
mutex_unlock(&upperdentry->d_inode->i_mutex);
} else {
err = ovl_copy_up_last(dentry, attr, false);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bd6d5c1e667d..39266655d2bd 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location. Its purity depends only on opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cf6fa25f884b..9a895b415711 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
#include "xattr.h"
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/quotaops.h>
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index dc33f9416340..250579a80d90 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -80,6 +80,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a56960dd1684..feb7e3f01e8e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1874,6 +1874,7 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
+ struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1893,6 +1894,11 @@ xfs_vm_set_page_dirty(
offset += 1 << inode->i_blkbits;
} while (bh != head);
}
+ /*
+ * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1903,13 +1909,15 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3b7591224f4a..7c62fca53e2f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
static const struct vm_operations_struct xfs_file_vm_ops;