From 34f8fe501f0624de115d087680c84000b5d9abc9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:53 -0700 Subject: bdi: Add bdi->id There currently is no way to universally identify and lookup a bdi without holding a reference and pointer to it. This patch adds an non-recycling bdi->id and implements bdi_get_by_id() which looks up bdis by their ids. This will be used by memcg foreign inode flushing. I left bdi_list alone for simplicity and because while rb_tree does support rcu assignment it doesn't seem to guarantee lossless walk when walk is racing aginst tree rebalance operations. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- mm/backing-dev.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e8e89158adec..612aa7c5ddbd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side - * locking. + * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU + * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); +static u64 bdi_id_cursor; +static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ @@ -859,9 +862,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } EXPORT_SYMBOL(bdi_alloc_node); +static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) +{ + struct rb_node **p = &bdi_tree.rb_node; + struct rb_node *parent = NULL; + struct backing_dev_info *bdi; + + lockdep_assert_held(&bdi_lock); + + while (*p) { + parent = *p; + bdi = rb_entry(parent, struct backing_dev_info, rb_node); + + if (bdi->id > id) + p = &(*p)->rb_left; + else if (bdi->id < id) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * bdi_get_by_id - lookup and get bdi from its id + * @id: bdi id to lookup + * + * Find bdi matching @id and get it. Returns NULL if the matching bdi + * doesn't exist or is already unregistered. + */ +struct backing_dev_info *bdi_get_by_id(u64 id) +{ + struct backing_dev_info *bdi = NULL; + struct rb_node **p; + + spin_lock_bh(&bdi_lock); + p = bdi_lookup_rb_node(id, NULL); + if (*p) { + bdi = rb_entry(*p, struct backing_dev_info, rb_node); + bdi_get(bdi); + } + spin_unlock_bh(&bdi_lock); + + return bdi; +} + int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; + struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; @@ -877,7 +929,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); + + bdi->id = ++bdi_id_cursor; + + p = bdi_lookup_rb_node(bdi->id, &parent); + rb_link_node(&bdi->rb_node, parent, p); + rb_insert_color(&bdi->rb_node, &bdi_tree); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); @@ -918,6 +978,7 @@ EXPORT_SYMBOL(bdi_register_owner); static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); + rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); -- cgit v1.2.3 From ed288dc0d4aa29f65bd25b31b5cb866aa5664ff9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:54 -0700 Subject: writeback: Separate out wb_get_lookup() from wb_get_create() Separate out wb_get_lookup() which doesn't try to create one if there isn't already one from wb_get_create(). This will be used by later patches. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- mm/backing-dev.c | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 612aa7c5ddbd..d9daa3e422d0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -618,13 +618,12 @@ out_put: } /** - * wb_get_create - get wb for a given memcg, create if necessary + * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) - * @gfp: allocation mask to use * - * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to - * create one. The returned wb has its refcount incremented. + * Try to get the wb for @memcg_css on @bdi. The returned wb has its + * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on @@ -641,6 +640,39 @@ out_put: * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ +struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css) +{ + struct bdi_writeback *wb; + + if (!memcg_css->parent) + return &bdi->wb; + + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + + return wb; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. See wb_get_lookup() for more details. + */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) @@ -653,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, return &bdi->wb; do { - rcu_read_lock(); - wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); - if (wb) { - struct cgroup_subsys_state *blkcg_css; - - /* see whether the blkcg association has changed */ - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &io_cgrp_subsys); - if (unlikely(wb->blkcg_css != blkcg_css || - !wb_tryget(wb))) - wb = NULL; - css_put(blkcg_css); - } - rcu_read_unlock(); + wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; -- cgit v1.2.3 From 97b27821b4854ca744946dae32a3f2fd55bcd5bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:56 -0700 Subject: writeback, memcg: Implement foreign dirty flushing There's an inherent mismatch between memcg and writeback. The former trackes ownership per-page while the latter per-inode. This was a deliberate design decision because honoring per-page ownership in the writeback path is complicated, may lead to higher CPU and IO overheads and deemed unnecessary given that write-sharing an inode across different cgroups isn't a common use-case. Combined with inode majority-writer ownership switching, this works well enough in most cases but there are some pathological cases. For example, let's say there are two cgroups A and B which keep writing to different but confined parts of the same inode. B owns the inode and A's memory is limited far below B's. A's dirty ratio can rise enough to trigger balance_dirty_pages() sleeps but B's can be low enough to avoid triggering background writeback. A will be slowed down without a way to make writeback of the dirty pages happen. This patch implements foreign dirty recording and foreign mechanism so that when a memcg encounters a condition as above it can trigger flushes on bdi_writebacks which can clean its pages. Please see the comment on top of mem_cgroup_track_foreign_dirty_slowpath() for details. A reproducer follows. write-range.c:: #include #include #include #include #include static const char *usage = "write-range FILE START SIZE\n"; int main(int argc, char **argv) { int fd; unsigned long start, size, end, pos; char *endp; char buf[4096]; if (argc < 4) { fprintf(stderr, usage); return 1; } fd = open(argv[1], O_WRONLY); if (fd < 0) { perror("open"); return 1; } start = strtoul(argv[2], &endp, 0); if (*endp != '\0') { fprintf(stderr, usage); return 1; } size = strtoul(argv[3], &endp, 0); if (*endp != '\0') { fprintf(stderr, usage); return 1; } end = start + size; while (1) { for (pos = start; pos < end; ) { long bread, bwritten = 0; if (lseek(fd, pos, SEEK_SET) < 0) { perror("lseek"); return 1; } bread = read(0, buf, sizeof(buf) < end - pos ? sizeof(buf) : end - pos); if (bread < 0) { perror("read"); return 1; } if (bread == 0) return 0; while (bwritten < bread) { long this; this = write(fd, buf + bwritten, bread - bwritten); if (this < 0) { perror("write"); return 1; } bwritten += this; pos += bwritten; } } } } repro.sh:: #!/bin/bash set -e set -x sysctl -w vm.dirty_expire_centisecs=300000 sysctl -w vm.dirty_writeback_centisecs=300000 sysctl -w vm.dirtytime_expire_seconds=300000 echo 3 > /proc/sys/vm/drop_caches TEST=/sys/fs/cgroup/test A=$TEST/A B=$TEST/B mkdir -p $A $B echo "+memory +io" > $TEST/cgroup.subtree_control echo $((1<<30)) > $A/memory.high echo $((32<<30)) > $B/memory.high rm -f testfile touch testfile fallocate -l 4G testfile echo "Starting B" (echo $BASHPID > $B/cgroup.procs pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) & echo "Waiting 10s to ensure B claims the testfile inode" sleep 5 sync sleep 5 sync echo "Starting A" (echo $BASHPID > $A/cgroup.procs pv < /dev/urandom | ./write-range testfile 0 $((2<<30))) v2: Added comments explaining why the specific intervals are being used. v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort flushing while avoding possible livelocks. v4: Use get_jiffies_64() and time_before/after64() instead of raw jiffies_64 and arthimetic comparisons as suggested by Jan. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- mm/memcontrol.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 4 ++ 2 files changed, 138 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cdbb7a84cb6e..89b65f5ca634 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,6 +87,10 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +#ifdef CONFIG_CGROUP_WRITEBACK +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); +#endif + /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -4145,6 +4149,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, } } +/* + * Foreign dirty flushing + * + * There's an inherent mismatch between memcg and writeback. The former + * trackes ownership per-page while the latter per-inode. This was a + * deliberate design decision because honoring per-page ownership in the + * writeback path is complicated, may lead to higher CPU and IO overheads + * and deemed unnecessary given that write-sharing an inode across + * different cgroups isn't a common use-case. + * + * Combined with inode majority-writer ownership switching, this works well + * enough in most cases but there are some pathological cases. For + * example, let's say there are two cgroups A and B which keep writing to + * different but confined parts of the same inode. B owns the inode and + * A's memory is limited far below B's. A's dirty ratio can rise enough to + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid + * triggering background writeback. A will be slowed down without a way to + * make writeback of the dirty pages happen. + * + * Conditions like the above can lead to a cgroup getting repatedly and + * severely throttled after making some progress after each + * dirty_expire_interval while the underyling IO device is almost + * completely idle. + * + * Solving this problem completely requires matching the ownership tracking + * granularities between memcg and writeback in either direction. However, + * the more egregious behaviors can be avoided by simply remembering the + * most recent foreign dirtying events and initiating remote flushes on + * them when local writeback isn't enough to keep the memory clean enough. + * + * The following two functions implement such mechanism. When a foreign + * page - a page whose memcg and writeback ownerships don't match - is + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning + * bdi_writeback on the page owning memcg. When balance_dirty_pages() + * decides that the memcg needs to sleep due to high dirty ratio, it calls + * mem_cgroup_flush_foreign() which queues writeback on the recorded + * foreign bdi_writebacks which haven't expired. Both the numbers of + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are + * limited to MEMCG_CGWB_FRN_CNT. + * + * The mechanism only remembers IDs and doesn't hold any object references. + * As being wrong occasionally doesn't matter, updates and accesses to the + * records are lockless and racy. + */ +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, + struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + struct memcg_cgwb_frn *frn; + u64 now = get_jiffies_64(); + u64 oldest_at = now; + int oldest = -1; + int i; + + /* + * Pick the slot to use. If there is already a slot for @wb, keep + * using it. If not replace the oldest one which isn't being + * written out. + */ + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + frn = &memcg->cgwb_frn[i]; + if (frn->bdi_id == wb->bdi->id && + frn->memcg_id == wb->memcg_css->id) + break; + if (time_before64(frn->at, oldest_at) && + atomic_read(&frn->done.cnt) == 1) { + oldest = i; + oldest_at = frn->at; + } + } + + if (i < MEMCG_CGWB_FRN_CNT) { + /* + * Re-using an existing one. Update timestamp lazily to + * avoid making the cacheline hot. We want them to be + * reasonably up-to-date and significantly shorter than + * dirty_expire_interval as that's what expires the record. + * Use the shorter of 1s and dirty_expire_interval / 8. + */ + unsigned long update_intv = + min_t(unsigned long, HZ, + msecs_to_jiffies(dirty_expire_interval * 10) / 8); + + if (time_before64(frn->at, now - update_intv)) + frn->at = now; + } else if (oldest >= 0) { + /* replace the oldest free one */ + frn = &memcg->cgwb_frn[oldest]; + frn->bdi_id = wb->bdi->id; + frn->memcg_id = wb->memcg_css->id; + frn->at = now; + } +} + +/* issue foreign writeback flushes for recorded foreign dirtying events */ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); + u64 now = jiffies_64; + int i; + + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; + + /* + * If the record is older than dirty_expire_interval, + * writeback on it has already started. No need to kick it + * off again. Also, don't start a new one if there's + * already one in flight. + */ + if (time_after64(frn->at, now - intv) && + atomic_read(&frn->done.cnt) == 1) { + frn->at = 0; + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + WB_REASON_FOREIGN_FLUSH, + &frn->done); + } + } +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) @@ -4661,6 +4786,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) struct mem_cgroup *memcg; unsigned int size; int node; + int __maybe_unused i; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); @@ -4704,6 +4830,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + memcg->cgwb_frn[i].done = + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -4833,7 +4962,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + int __maybe_unused i; +#ifdef CONFIG_CGROUP_WRITEBACK + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + wb_wait_for_completion(&memcg->cgwb_frn[i].done); +#endif if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1804f64ff43c..50055d2e4ea8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); + /* * Calculate global domain's pos_ratio and select the * global dtc by default. @@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); + + mem_cgroup_track_foreign_dirty(page, wb); } } -- cgit v1.2.3 From 3a8e9ac89e6a5106cfb6b85d4c9cf9bfa3519bc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Aug 2019 15:47:19 -0700 Subject: writeback: add tracepoints for cgroup foreign writebacks cgroup foreign inode handling has quite a bit of heuristics and internal states which sometimes makes it difficult to understand what's going on. Add tracepoints to improve visibility. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- mm/memcontrol.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 89b65f5ca634..4390994e8be9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4066,6 +4066,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK +#include + static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -4203,6 +4205,8 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, int oldest = -1; int i; + trace_track_foreign_dirty(page, wb); + /* * Pick the slot to use. If there is already a slot for @wb, keep * using it. If not replace the oldest one which isn't being @@ -4263,6 +4267,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) if (time_after64(frn->at, now - intv) && atomic_read(&frn->done.cnt) == 1) { frn->at = 0; + trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, WB_REASON_FOREIGN_FLUSH, &frn->done); -- cgit v1.2.3