aboutsummaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c86
1 files changed, 63 insertions, 23 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e1a95dbcd5f8..763d2fd3cadd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+ if (!memcg)
+ return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ if (memcg->css.cgroup)
+ return true;
+#endif
+ return false;
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+ return true;
+}
#endif
static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi,
- struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
- if (!bdi_write_congested(bdi))
+ if (!inode_write_congested(inode))
return 1;
- if (bdi == current->backing_dev_info)
+ if (inode_to_bdi(inode) == current->backing_dev_info)
return 1;
return 0;
}
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+ if (!may_write_to_inode(mapping->host, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
+ unsigned long flags;
+ struct mem_cgroup *memcg;
+
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- spin_lock_irq(&mapping->tree_lock);
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
*
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow);
- spin_unlock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, shadow, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(inode_to_bdi(mapping->host))) ||
+ inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
nr_congested++;
@@ -935,11 +972,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* note that the LRU is being scanned too quickly and the
* caller can stall after page list has been processed.
*
- * 2) Global reclaim encounters a page, memcg encounters a
- * page that is not marked for immediate reclaim or
- * the caller does not have __GFP_FS (or __GFP_IO if it's
- * simply going to swap, not to fs). In this case mark
- * the page for immediate reclaim and continue scanning.
+ * 2) Global or new memcg reclaim encounters a page that is
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
+ * reclaim and continue scanning.
*
* Require may_enter_fs because we would wait on fs, which
* may not have submitted IO yet. And the loop driver might
@@ -948,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) memcg encounters a page that is not already marked
+ * 3) Legacy memcg encounters a page that is already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
@@ -963,7 +1000,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Case 2 above */
- } else if (global_reclaim(sc) ||
+ } else if (sane_reclaim(sc) ||
!PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
@@ -978,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
SetPageReclaim(page);
nr_writeback++;
-
goto keep_locked;
/* Case 3 above */
} else {
+ unlock_page(page);
wait_on_page_writeback(page);
+ /* then go back and try same page again */
+ list_add_tail(&page->lru, page_list);
+ continue;
}
}
@@ -1412,7 +1452,7 @@ static int too_many_isolated(struct zone *zone, int file,
if (current_is_kswapd())
return 0;
- if (!global_reclaim(sc))
+ if (!sane_reclaim(sc))
return 0;
if (file) {
@@ -1604,10 +1644,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
set_bit(ZONE_WRITEBACK, &zone->flags);
/*
- * memcg will stall in page writeback so only consider forcibly
- * stalling for global reclaim
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling here.
*/
- if (global_reclaim(sc)) {
+ if (sane_reclaim(sc)) {
/*
* Tag a zone as congested if all the dirty pages scanned were
* backed by a congested BDI and wait_iff_congested will stall.