Merge branch 'writeback-workqueue' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq into for-3.10/core

Tejun writes: ----- This is the pull request for the earlier patchset[1] with the same name. It's only three patches (the first one was committed to workqueue tree) but the merge strategy is a bit involved due to the dependencies. * Because the conversion needs features from wq/for-3.10, block/for-3.10/core is based on rc3, and wq/for-3.10 has conflicts with rc3, I pulled mainline (rc5) into wq/for-3.10 to prevent those workqueue conflicts from flaring up in block tree. * Resolving the issue that Jan and Dave raised about debugging requires arch-wide changes. The patchset is being worked on[2] but it'll have to go through -mm after these changes show up in -next, and not included in this pull request. The three commits are located in the following git branch. git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git writeback-workqueue Pulling it into block/for-3.10/core produces a conflict in drivers/md/raid5.c between the following two commits. e3620a3ad5 ("MD RAID5: Avoid accessing gendisk or queue structs when not available") 2f6db2a707 ("raid5: use bio_reset()") The conflict is trivial - one removes an "if ()" conditional while the other removes "rbi->bi_next = NULL" right above it. We just need to remove both. The merged branch is available at git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git block-test-merge so that you can use it for verification. The test merge commit has proper merge description. While these changes are a bit of pain to route, they make code simpler and even have, while minute, measureable performance gain[3] even on a workload which isn't particularly favorable to showing the benefits of this conversion. ---- Fixed up the conflict. Conflicts: drivers/md/raid5.c Signed-off-by: Jens Axboe <axboe@kernel.dk>
author: Jens Axboe <axboe@kernel.dk> 2013-04-02 10:04:39 +0200
committer: Jens Axboe <axboe@kernel.dk> 2013-04-02 10:04:39 +0200
commit: 64f8de4da7d3962632f152d3d702d68bb8accc29 (patch)
tree: c90a872a6d91c824635d59572e1e578980f4bc98 /mm
parent: f1fb3449efd5c49b48e35746bc7283eb9c73e3a0 (diff)
parent: b5c872ddb7083c7909fb76a170c3807e04564bb3 (diff)
6 files changed, 50 insertions, 250 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41733c5dc82..50251749225 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 static struct class *bdi_class;
 
 /*
- * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
- * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
  * locking.
  */
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
-LIST_HEAD(bdi_pending_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
 
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					      WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!bdi_wq)
+		return -ENOMEM;
+
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
 	return wb_has_dirty_io(&bdi->wb);
 }
 
-static void wakeup_timer_fn(unsigned long data)
-{
-	struct backing_dev_info *bdi = (struct backing_dev_info *)data;
-
-	spin_lock_bh(&bdi->wb_lock);
-	if (bdi->wb.task) {
-		trace_writeback_wake_thread(bdi);
-		wake_up_process(bdi->wb.task);
-	} else if (bdi->dev) {
-		/*
-		 * When bdi tasks are inactive for long time, they are killed.
-		 * In this case we have to wake-up the forker thread which
-		 * should create and run the bdi thread.
-		 */
-		trace_writeback_wake_forker_thread(bdi);
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-	spin_unlock_bh(&bdi->wb_lock);
-}
-
 /*
  * This function is used when the first inode for this bdi is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
 	unsigned long timeout;
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
-}
-
-/*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
- */
-static unsigned long bdi_longest_inactive(void)
-{
-	unsigned long interval;
-
-	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
-	return max(5UL * 60 * HZ, interval);
-}
-
-/*
- * Clear pending bit and wakeup anybody waiting for flusher thread creation or
- * shutdown
- */
-static void bdi_clear_pending(struct backing_dev_info *bdi)
-{
-	clear_bit(BDI_pending, &bdi->state);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&bdi->state, BDI_pending);
-}
-
-static int bdi_forker_thread(void *ptr)
-{
-	struct bdi_writeback *me = ptr;
-
-	current->flags |= PF_SWAPWRITE;
-	set_freezable();
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	for (;;) {
-		struct task_struct *task = NULL;
-		struct backing_dev_info *bdi;
-		enum {
-			NO_ACTION,   /* Nothing to do */
-			FORK_THREAD, /* Fork bdi thread */
-			KILL_THREAD, /* Kill inactive bdi thread */
-		} action = NO_ACTION;
-
-		/*
-		 * Temporary measure, we want to make sure we don't see
-		 * dirty data on the default backing_dev_info
-		 */
-		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
-			del_timer(&me->wakeup_timer);
-			wb_do_writeback(me, 0);
-		}
-
-		spin_lock_bh(&bdi_lock);
-		/*
-		 * In the following loop we are going to check whether we have
-		 * some work to do without any synchronization with tasks
-		 * waking us up to do work for them. Set the task state here
-		 * so that we don't miss wakeups after verifying conditions.
-		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		list_for_each_entry(bdi, &bdi_list, bdi_list) {
-			bool have_dirty_io;
-
-			if (!bdi_cap_writeback_dirty(bdi) ||
-			     bdi_cap_flush_forker(bdi))
-				continue;
-
-			WARN(!test_bit(BDI_registered, &bdi->state),
-			     "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
-			have_dirty_io = !list_empty(&bdi->work_list) ||
-					wb_has_dirty_io(&bdi->wb);
-
-			/*
-			 * If the bdi has work to do, but the thread does not
-			 * exist - create it.
-			 */
-			if (!bdi->wb.task && have_dirty_io) {
-				/*
-				 * Set the pending bit - if someone will try to
-				 * unregister this bdi - it'll wait on this bit.
-				 */
-				set_bit(BDI_pending, &bdi->state);
-				action = FORK_THREAD;
-				break;
-			}
-
-			spin_lock(&bdi->wb_lock);
-
-			/*
-			 * If there is no work to do and the bdi thread was
-			 * inactive long enough - kill it. The wb_lock is taken
-			 * to make sure no-one adds more work to this bdi and
-			 * wakes the bdi thread up.
-			 */
-			if (bdi->wb.task && !have_dirty_io &&
-			    time_after(jiffies, bdi->wb.last_active +
-						bdi_longest_inactive())) {
-				task = bdi->wb.task;
-				bdi->wb.task = NULL;
-				spin_unlock(&bdi->wb_lock);
-				set_bit(BDI_pending, &bdi->state);
-				action = KILL_THREAD;
-				break;
-			}
-			spin_unlock(&bdi->wb_lock);
-		}
-		spin_unlock_bh(&bdi_lock);
-
-		/* Keep working if default bdi still has things to do */
-		if (!list_empty(&me->bdi->work_list))
-			__set_current_state(TASK_RUNNING);
-
-		switch (action) {
-		case FORK_THREAD:
-			__set_current_state(TASK_RUNNING);
-			task = kthread_create(bdi_writeback_thread, &bdi->wb,
-					      "flush-%s", dev_name(bdi->dev));
-			if (IS_ERR(task)) {
-				/*
-				 * If thread creation fails, force writeout of
-				 * the bdi from the thread. Hopefully 1024 is
-				 * large enough for efficient IO.
-				 */
-				writeback_inodes_wb(&bdi->wb, 1024,
-						    WB_REASON_FORKER_THREAD);
-			} else {
-				/*
-				 * The spinlock makes sure we do not lose
-				 * wake-ups when racing with 'bdi_queue_work()'.
-				 * And as soon as the bdi thread is visible, we
-				 * can start it.
-				 */
-				spin_lock_bh(&bdi->wb_lock);
-				bdi->wb.task = task;
-				spin_unlock_bh(&bdi->wb_lock);
-				wake_up_process(task);
-			}
-			bdi_clear_pending(bdi);
-			break;
-
-		case KILL_THREAD:
-			__set_current_state(TASK_RUNNING);
-			kthread_stop(task);
-			bdi_clear_pending(bdi);
-			break;
-
-		case NO_ACTION:
-			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
-				/*
-				 * There are no dirty data. The only thing we
-				 * should now care about is checking for
-				 * inactive bdi threads and killing them. Thus,
-				 * let's sleep for longer time, save energy and
-				 * be friendly for battery-driven devices.
-				 */
-				schedule_timeout(bdi_longest_inactive());
-			else
-				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-			try_to_freeze();
-			break;
-		}
-	}
-
-	return 0;
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 }
 
 /*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
 	spin_unlock_bh(&bdi_lock);
 
 	synchronize_rcu_expedited();
+
+	/* bdi_list is now unused, clear it to mark @bdi dying */
+	INIT_LIST_HEAD(&bdi->bdi_list);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi->dev = dev;
 
-	/*
-	 * Just start the forker thread for our default backing_dev_info,
-	 * and add other bdi's to the list. They will get a thread created
-	 * on-demand when they need it.
-	 */
-	if (bdi_cap_flush_forker(bdi)) {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
-						dev_name(dev));
-		if (IS_ERR(wb->task))
-			return PTR_ERR(wb->task);
-	}
-
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
 
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-	struct task_struct *task;
-
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
 
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	bdi_remove_from_list(bdi);
 
 	/*
-	 * If setup is pending, wait for that to complete first
+	 * Drain work list and shutdown the delayed_work.  At this point,
+	 * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
+	 * is dying and its work_list needs to be drained no matter what.
 	 */
-	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
-			TASK_UNINTERRUPTIBLE);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	flush_delayed_work(&bdi->wb.dwork);
+	WARN_ON(!list_empty(&bdi->work_list));
 
 	/*
-	 * Finally, kill the kernel thread. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility.
+	 * This shouldn't be necessary unless @bdi for some reason has
+	 * unflushed dirty IO after work_list is drained.  Do it anyway
+	 * just in case.
 	 */
-	spin_lock_bh(&bdi->wb_lock);
-	task = bdi->wb.task;
-	bdi->wb.task = NULL;
-	spin_unlock_bh(&bdi->wb_lock);
-
-	if (task)
-		kthread_stop(task);
+	cancel_delayed_work_sync(&bdi->wb.dwork);
 }
 
 /*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
 		bdi_set_min_ratio(bdi, 0);
 		trace_writeback_bdi_unregister(bdi);
 		bdi_prune_sb(bdi);
-		del_timer_sync(&bdi->wb.wakeup_timer);
 
-		if (!bdi_cap_flush_forker(bdi))
-			bdi_wb_shutdown(bdi);
+		bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
 
 		spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
 	spin_lock_init(&wb->list_lock);
-	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
 
 /*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	bdi_unregister(bdi);
 
 	/*
-	 * If bdi_unregister() had already been called earlier, the
-	 * wakeup_timer could still be armed because bdi_prune_sb()
-	 * can race with the bdi_wakeup_thread_delayed() calls from
-	 * __mark_inode_dirty().
+	 * If bdi_unregister() had already been called earlier, the dwork
+	 * could still be pending because bdi_prune_sb() can race with the
+	 * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
 	 */
-	del_timer_sync(&bdi->wb.wakeup_timer);
+	cancel_delayed_work_sync(&bdi->wb.dwork);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
diff --git a/mm/fremap.c b/mm/fremap.c
index 4723ac8d2fc..87da3590c61 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -204,10 +204,8 @@ get_write_lock:
 			unsigned long addr;
 			struct file *file = get_file(vma->vm_file);
 
-			vm_flags = vma->vm_flags;
-			if (!(flags & MAP_NONBLOCK))
-				vm_flags |= VM_POPULATE;
-			addr = mmap_region(file, start, size, vm_flags, pgoff);
+			addr = mmap_region(file, start, size,
+					vma->vm_flags, pgoff);
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
@@ -226,12 +224,6 @@ get_write_lock:
 		mutex_unlock(&mapping->i_mmap_mutex);
 	}
 
-	if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
-		if (!has_write_lock)
-			goto get_write_lock;
-		vma->vm_flags |= VM_POPULATE;
-	}
-
 	if (vma->vm_flags & VM_LOCKED) {
 		/*
 		 * drop PG_Mlocked flag for over-mapped range
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a0be33bb19..ca9a7c6d7e9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2124,8 +2124,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-	struct hstate *h = &default_hstate;
-	return h->nr_huge_pages * pages_per_huge_page(h);
+	struct hstate *h;
+	unsigned long nr_total_pages = 0;
+
+	for_each_hstate(h)
+		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+	return nr_total_pages;
 }
 
 static int hugetlb_acct_memory(struct hstate *h, long delta)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9597eec8239..ee376576081 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1779,7 +1779,11 @@ void try_offline_node(int nid)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
-		if (zone->wait_table)
+		/*
+		 * wait_table may be allocated from boot memory,
+		 * here only free if it's allocated by vmalloc.
+		 */
+		if (is_vmalloc_addr(zone->wait_table))
 			vfree(zone->wait_table);
 	}
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 1c5e33fce63..79b7cf7d1bc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -358,7 +358,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
 
 		newflags = vma->vm_flags & ~VM_LOCKED;
 		if (on)
-			newflags |= VM_LOCKED | VM_POPULATE;
+			newflags |= VM_LOCKED;
 
 		tmp = vma->vm_end;
 		if (tmp > end)
@@ -418,8 +418,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 		 * range with the first VMA. Also, skip undesirable VMA types.
 		 */
 		nend = min(end, vma->vm_end);
-		if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
-		    VM_POPULATE)
+		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 			continue;
 		if (nstart < vma->vm_start)
 			nstart = vma->vm_start;
@@ -492,9 +491,9 @@ static int do_mlockall(int flags)
 	struct vm_area_struct * vma, * prev = NULL;
 
 	if (flags & MCL_FUTURE)
-		current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
+		current->mm->def_flags |= VM_LOCKED;
 	else
-		current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
+		current->mm->def_flags &= ~VM_LOCKED;
 	if (flags == MCL_FUTURE)
 		goto out;
 
@@ -503,7 +502,7 @@ static int do_mlockall(int flags)
 
 		newflags = vma->vm_flags & ~VM_LOCKED;
 		if (flags & MCL_CURRENT)
-			newflags |= VM_LOCKED | VM_POPULATE;
+			newflags |= VM_LOCKED;
 
 		/* Ignore errors */
 		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 2664a47cec9..6466699b16c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1306,7 +1306,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	}
 
 	addr = mmap_region(file, addr, len, vm_flags, pgoff);
-	if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
+	if (!IS_ERR_VALUE(addr) &&
+	    ((vm_flags & VM_LOCKED) ||
+	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
 	return addr;
 }
author	Jens Axboe <axboe@kernel.dk>	2013-04-02 10:04:39 +0200
committer	Jens Axboe <axboe@kernel.dk>	2013-04-02 10:04:39 +0200
commit	64f8de4da7d3962632f152d3d702d68bb8accc29 (patch)
tree	c90a872a6d91c824635d59572e1e578980f4bc98 /mm
parent	f1fb3449efd5c49b48e35746bc7283eb9c73e3a0 (diff)
parent	b5c872ddb7083c7909fb76a170c3807e04564bb3 (diff)