From 569b846df54ffb2827b83ce3244c5f032394cba4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 15 Dec 2009 16:47:03 -0800 Subject: memcg: coalesce uncharge during unmap/truncate In massive parallel enviroment, res_counter can be a performance bottleneck. One strong techinque to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge chatacteristic, - charge is done one by one via demand-paging. - uncharge is done by - in chunk at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is a for coalescing uncharge. For avoiding scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. A reason for per-task batching is for making use of caller's context information. We do batched uncharge (deleyed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/trucate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Then, we'll not coalescing too much. On x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per a cpu by taskset and see sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amounts of improvement. (root cgroup doesn't affected by this patch) Another patch for "charge" will follow this and above will be improved more. Changelog(since 2009/10/02): - renamed filed of memcg_batch (as pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialize code to copy_process(). (possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added commetns. - make ->do_batch as bool. - removed css_get() at el. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 90 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b5b108c1c6..a730c91b8e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) css_put(&mem->css); } +static void +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +{ + struct memcg_batch_info *batch = NULL; + bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ + if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) + uncharge_memsw = false; + /* + * do_batch > 0 when unmapping pages or inode invalidate/truncate. + * In those cases, all pages freed continously can be expected to be in + * the same cgroup and we have chance to coalesce uncharges. + * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) + * because we want to do uncharge as soon as possible. + */ + if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) + goto direct_uncharge; + + batch = ¤t->memcg_batch; + /* + * In usual, we do css_get() when we remember memcg pointer. + * But in this case, we keep res->usage until end of a series of + * uncharges. Then, it's ok to ignore memcg's refcnt. + */ + if (!batch->memcg) + batch->memcg = mem; + /* + * In typical case, batch->memcg == mem. This means we can + * merge a series of uncharges to an uncharge of res_counter. + * If not, we uncharge res_counter ony by one. + */ + if (batch->memcg != mem) + goto direct_uncharge; + /* remember freed charge and uncharge it later */ + batch->bytes += PAGE_SIZE; + if (uncharge_memsw) + batch->memsw_bytes += PAGE_SIZE; + return; +direct_uncharge: + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (uncharge_memsw) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + return; +} /* * uncharge if !page_mapped(page) @@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account && - (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - } + if (!mem_cgroup_is_root(mem)) + __do_uncharge(mem, ctype); if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); @@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +/* + * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. + * In that cases, pages are freed continuously and we can expect pages + * are in the same memcg. All these calls itself limits the number of + * pages freed at once, then uncharge_start/end() is called properly. + * This may be called prural(2) times in a context, + */ + +void mem_cgroup_uncharge_start(void) +{ + current->memcg_batch.do_batch++; + /* We can do nest. */ + if (current->memcg_batch.do_batch == 1) { + current->memcg_batch.memcg = NULL; + current->memcg_batch.bytes = 0; + current->memcg_batch.memsw_bytes = 0; + } +} + +void mem_cgroup_uncharge_end(void) +{ + struct memcg_batch_info *batch = ¤t->memcg_batch; + + if (!batch->do_batch) + return; + + batch->do_batch--; + if (batch->do_batch) /* If stacked, do nothing. */ + return; + + if (!batch->memcg) + return; + /* + * This "batch->memcg" is valid without any css_get/put etc... + * bacause we hide charges behind us. + */ + if (batch->bytes) + res_counter_uncharge(&batch->memcg->res, batch->bytes); + if (batch->memsw_bytes) + res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + /* forget this pointer (for sanity check) */ + batch->memcg = NULL; +} + #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. -- cgit v1.2.3