author     Linus Torvalds <torvalds@linux-foundation.org>  2014-04-04 15:31:36 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-04-04 15:31:36 -0700
commit     53c566625fb872e7826a237f0f5c21458028e94a (patch)
tree       8ef9990ed2124f085442bc5a44c3f5212bf4002d
parent     34917f9713905a937816ebb7ee5f25bef7a6441c (diff)
parent     00fdf13a2e9f313a044288aa59d3b8ec29ff904a (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs changes from Chris Mason:
 "This is a pretty long stream of bug fixes and performance fixes.

  Qu Wenruo has replaced the btrfs async threads with regular kernel
  workqueues.  We'll keep an eye out for performance differences, but
  it's nice to be using more generic code for this.

  We still have some corruption fixes and other patches coming in for
  the merge window, but this batch is tested and ready to go"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (108 commits)
  Btrfs: fix a crash of clone with inline extents's split
  btrfs: fix uninit variable warning
  Btrfs: take into account total references when doing backref lookup
  Btrfs: part 2, fix incremental send's decision to delay a dir move/rename
  Btrfs: fix incremental send's decision to delay a dir move/rename
  Btrfs: remove unnecessary inode generation lookup in send
  Btrfs: fix race when updating existing ref head
  btrfs: Add trace for btrfs_workqueue alloc/destroy
  Btrfs: less fs tree lock contention when using autodefrag
  Btrfs: return EPERM when deleting a default subvolume
  Btrfs: add missing kfree in btrfs_destroy_workqueue
  Btrfs: cache extent states in defrag code path
  Btrfs: fix deadlock with nested trans handles
  Btrfs: fix possible empty list access when flushing the delalloc inodes
  Btrfs: split the global ordered extents mutex
  Btrfs: don't flush all delalloc inodes when we doesn't get s_umount lock
  Btrfs: reclaim delalloc metadata more aggressively
  Btrfs: remove unnecessary lock in may_commit_transaction()
  Btrfs: remove the unnecessary flush when preparing the pages
  Btrfs: just do dirty page flush for the inode with compression before direct IO
  ...
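The headline change in this pull is the conversion of the btrfs async-thread pool (fs/btrfs/async-thread.[ch], first diff below) to plain kernel workqueues. As a quick orientation, here is a minimal sketch of the usage pattern the new API expects, based only on the declarations added in this series; everything named my_* and the flag choice are hypothetical, not code from the patch:

/*
 * Illustrative sketch only, not part of the merge: how a caller uses the
 * new btrfs_workqueue API.  The btrfs_* calls are the ones added below;
 * the my_* names are invented.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "async-thread.h"

struct my_async_job {
	struct btrfs_work work;	/* embed, container_of() back in the callback */
	u64 payload;
};

static void my_job_func(struct btrfs_work *work)
{
	struct my_async_job *job = container_of(work, struct my_async_job, work);

	/* ... do the actual work with job->payload ... */
	kfree(job);		/* no ordered_free, so func owns the lifetime */
}

static int my_submit(struct btrfs_workqueue *wq, u64 payload)
{
	struct my_async_job *job = kmalloc(sizeof(*job), GFP_NOFS);

	if (!job)
		return -ENOMEM;
	job->payload = payload;
	btrfs_init_work(&job->work, my_job_func, NULL, NULL);
	btrfs_queue_work(wq, &job->work);
	return 0;
}

The queue itself is created once, e.g. wq = btrfs_alloc_workqueue("worker", WQ_FREEZABLE, max_active, 0), and torn down with btrfs_destroy_workqueue(); a thresh of 0 falls back to DFT_THRESHOLD, and passing WQ_HIGHPRI additionally creates the ->high queue that btrfs_set_work_high_priority() targets.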
-rw-r--r--  fs/btrfs/async-thread.c        848
-rw-r--r--  fs/btrfs/async-thread.h        121
-rw-r--r--  fs/btrfs/backref.c              84
-rw-r--r--  fs/btrfs/btrfs_inode.h          14
-rw-r--r--  fs/btrfs/ctree.c                11
-rw-r--r--  fs/btrfs/ctree.h                73
-rw-r--r--  fs/btrfs/delayed-inode.c         6
-rw-r--r--  fs/btrfs/delayed-ref.c          29
-rw-r--r--  fs/btrfs/dev-replace.c          79
-rw-r--r--  fs/btrfs/disk-io.c             281
-rw-r--r--  fs/btrfs/extent-tree.c          58
-rw-r--r--  fs/btrfs/extent_io.c            15
-rw-r--r--  fs/btrfs/extent_map.c           56
-rw-r--r--  fs/btrfs/extent_map.h           10
-rw-r--r--  fs/btrfs/file.c                158
-rw-r--r--  fs/btrfs/inode.c               121
-rw-r--r--  fs/btrfs/ioctl.c               210
-rw-r--r--  fs/btrfs/ordered-data.c         68
-rw-r--r--  fs/btrfs/ordered-data.h          6
-rw-r--r--  fs/btrfs/qgroup.c               15
-rw-r--r--  fs/btrfs/raid56.c               21
-rw-r--r--  fs/btrfs/reada.c                 4
-rw-r--r--  fs/btrfs/relocation.c            2
-rw-r--r--  fs/btrfs/root-tree.c             3
-rw-r--r--  fs/btrfs/scrub.c                97
-rw-r--r--  fs/btrfs/send.c                821
-rw-r--r--  fs/btrfs/super.c                37
-rw-r--r--  fs/btrfs/sysfs.c                33
-rw-r--r--  fs/btrfs/sysfs.h                 5
-rw-r--r--  fs/btrfs/transaction.c          39
-rw-r--r--  fs/btrfs/tree-log.c            236
-rw-r--r--  fs/btrfs/tree-log.h             18
-rw-r--r--  fs/btrfs/volumes.c              46
-rw-r--r--  fs/btrfs/volumes.h               1
-rw-r--r--  include/trace/events/btrfs.h   137
35 files changed, 2137 insertions(+), 1626 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..ecb5832c0967 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -21,708 +22,313 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <linux/workqueue.h>
#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+ /* Thresholding related variables */
+ atomic_t pending;
+ int max_active;
+ int current_max;
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
-#define WORK_QUEUED_BIT 0
-#define WORK_DONE_BIT 1
-#define WORK_ORDER_DONE_BIT 2
-#define WORK_HIGH_PRIO_BIT 3
-
-/*
- * container for the kthread task pointer and the list of pending work
- * One of these is allocated per thread.
- */
-struct btrfs_worker_thread {
- /* pool we belong to */
- struct btrfs_workers *workers;
-
- /* list of struct btrfs_work that are waiting for service */
- struct list_head pending;
- struct list_head prio_pending;
-
- /* list of worker threads from struct btrfs_workers */
- struct list_head worker_list;
-
- /* kthread */
- struct task_struct *task;
+struct btrfs_workqueue {
+ struct __btrfs_workqueue *normal;
+ struct __btrfs_workqueue *high;
+};
- /* number of things on the pending list */
- atomic_t num_pending;
+static inline struct __btrfs_workqueue
+*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+ int thresh)
+{
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- /* reference counter for this struct */
- atomic_t refs;
+ if (unlikely(!ret))
+ return NULL;
- unsigned long sequence;
+ ret->max_active = max_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_max = max_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ ret->current_max = 1;
+ ret->thresh = thresh;
+ }
- /* protects the pending list. */
- spinlock_t lock;
+ if (flags & WQ_HIGHPRI)
+ ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+ ret->max_active,
+ "btrfs", name);
+ else
+ ret->normal_wq = alloc_workqueue("%s-%s", flags,
+ ret->max_active, "btrfs",
+ name);
+ if (unlikely(!ret->normal_wq)) {
+ kfree(ret);
+ return NULL;
+ }
- /* set to non-zero when this thread is already awake and kicking */
- int working;
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+ return ret;
+}
- /* are we currently idle */
- int idle;
-};
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-static int __btrfs_start_workers(struct btrfs_workers *workers);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
-/*
- * btrfs_start_workers uses kthread_run, which can block waiting for memory
- * for a very long time. It will actually throttle on page writeback,
- * and so it may not make progress until after our btrfs worker threads
- * process all of the pending work structs in their queue
- *
- * This means we can't use btrfs_start_workers from inside a btrfs worker
- * thread that is used as part of cleaning dirty memory, which pretty much
- * involves all of the worker threads.
- *
- * Instead we have a helper queue who never has more than one thread
- * where we scheduler thread start operations. This worker_start struct
- * is used to contain the work and hold a pointer to the queue that needs
- * another worker.
- */
-struct worker_start {
- struct btrfs_work work;
- struct btrfs_workers *queue;
-};
+ if (unlikely(!ret))
+ return NULL;
-static void start_new_worker_func(struct btrfs_work *work)
-{
- struct worker_start *start;
- start = container_of(work, struct worker_start, work);
- __btrfs_start_workers(start->queue);
- kfree(start);
-}
+ ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ max_active, thresh);
+ if (unlikely(!ret->normal)) {
+ kfree(ret);
+ return NULL;
+ }
-/*
- * helper function to move a thread onto the idle list after it
- * has finished some requests.
- */
-static void check_idle_worker(struct btrfs_worker_thread *worker)
-{
- if (!worker->idle && atomic_read(&worker->num_pending) <
- worker->workers->idle_thresh / 2) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 1;
-
- /* the list may be empty if the worker is just starting */
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move(&worker->worker_list,
- &worker->workers->idle_list);
+ if (flags & WQ_HIGHPRI) {
+ ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ thresh);
+ if (unlikely(!ret->high)) {
+ __btrfs_destroy_workqueue(ret->normal);
+ kfree(ret);
+ return NULL;
}
- spin_unlock_irqrestore(&worker->workers->lock, flags);
}
+ return ret;
}
/*
- * helper function to move a thread off the idle list after new
- * pending work is added.
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
*/
-static void check_busy_worker(struct btrfs_worker_thread *worker)
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
{
- if (worker->idle && atomic_read(&worker->num_pending) >=
- worker->workers->idle_thresh) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 0;
-
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- }
- spin_unlock_irqrestore(&worker->workers->lock, flags);
- }
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
}
-static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+/*
+ * Hook for threshold which will be called before executing the work.
+ * This hook is called in kthread context,
+ * so workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- struct btrfs_workers *workers = worker->workers;
- struct worker_start *start;
- unsigned long flags;
+ int new_max_active;
+ long pending;
+ int need_change = 0;
- rmb();
- if (!workers->atomic_start_pending)
+ if (wq->thresh == NO_THRESHOLD)
return;
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
- return;
-
- start->work.func = start_new_worker_func;
- start->queue = workers;
-
- spin_lock_irqsave(&workers->lock, flags);
- if (!workers->atomic_start_pending)
- goto out;
-
- workers->atomic_start_pending = 0;
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers)
- goto out;
-
- workers->num_workers_starting += 1;
- spin_unlock_irqrestore(&workers->lock, flags);
- btrfs_queue_worker(workers->atomic_worker_start, &start->work);
- return;
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_max_active = wq->current_max;
+ /*
+ * pending may change later, but that's OK since we don't need it
+ * to be very accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_max_active++;
+ if (pending < wq->thresh / 2)
+ new_max_active--;
+ new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+ if (new_max_active != wq->current_max) {
+ need_change = 1;
+ wq->current_max = new_max_active;
+ }
out:
- kfree(start);
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ }
}
-static noinline void run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void run_ordered_work(struct __btrfs_workqueue *wq)
{
- if (!workers->ordered)
- return;
-
- set_bit(WORK_DONE_BIT, &work->flags);
-
- spin_lock(&workers->order_lock);
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
while (1) {
- if (!list_empty(&workers->prio_order_list)) {
- work = list_entry(workers->prio_order_list.next,
- struct btrfs_work, order_list);
- } else if (!list_empty(&workers->order_list)) {
- work = list_entry(workers->order_list.next,
- struct btrfs_work, order_list);
- } else {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
break;
- }
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
- /* we are going to call the ordered done function, but
+ /*
+ * we are going to call the ordered done function, but
* we leave the work item on the list as a barrier so
* that later work items that are done don't have their
* functions called before this one returns
*/
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
-
- spin_unlock(&workers->order_lock);
-
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
work->ordered_func(work);
/* now take the lock again and drop our item from the list */
- spin_lock(&workers->order_lock);
- list_del(&work->order_list);
- spin_unlock(&workers->order_lock);
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
/*
* we don't want to call the ordered free functions
* with the lock held though
*/
work->ordered_free(work);
- spin_lock(&workers->order_lock);
- }
-
- spin_unlock(&workers->order_lock);
-}
-
-static void put_worker(struct btrfs_worker_thread *worker)
-{
- if (atomic_dec_and_test(&worker->refs))
- kfree(worker);
-}
-
-static int try_worker_shutdown(struct btrfs_worker_thread *worker)
-{
- int freeit = 0;
-
- spin_lock_irq(&worker->lock);
- spin_lock(&worker->workers->lock);
- if (worker->workers->num_workers > 1 &&
- worker->idle &&
- !worker->working &&
- !list_empty(&worker->worker_list) &&
- list_empty(&worker->prio_pending) &&
- list_empty(&worker->pending) &&
- atomic_read(&worker->num_pending) == 0) {
- freeit = 1;
- list_del_init(&worker->worker_list);
- worker->workers->num_workers--;
+ trace_btrfs_all_work_done(work);
}
- spin_unlock(&worker->workers->lock);
- spin_unlock_irq(&worker->lock);
-
- if (freeit)
- put_worker(worker);
- return freeit;
+ spin_unlock_irqrestore(lock, flags);
}
-static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
- struct list_head *prio_head,
- struct list_head *head)
-{
- struct btrfs_work *work = NULL;
- struct list_head *cur = NULL;
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
-
- smp_mb();
- if (!list_empty(&worker->prio_pending))
- goto refill;
-
- if (!list_empty(head))
- cur = head->next;
-
- if (cur)
- goto out;
-
-refill:
- spin_lock_irq(&worker->lock);
- list_splice_tail_init(&worker->prio_pending, prio_head);
- list_splice_tail_init(&worker->pending, head);
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
- else if (!list_empty(head))
- cur = head->next;
- spin_unlock_irq(&worker->lock);
-
- if (!cur)
- goto out_fail;
-
-out:
- work = list_entry(cur, struct btrfs_work, list);
-
-out_fail:
- return work;
-}
-
-/*
- * main loop for servicing work items
- */
-static int worker_loop(void *arg)
+static void normal_work_helper(struct work_struct *arg)
{
- struct btrfs_worker_thread *worker = arg;
- struct list_head head;
- struct list_head prio_head;
struct btrfs_work *work;
+ struct __btrfs_workqueue *wq;
+ int need_order = 0;
- INIT_LIST_HEAD(&head);
- INIT_LIST_HEAD(&prio_head);
-
- do {
-again:
- while (1) {
-
-
- work = get_next_work(worker, &prio_head, &head);
- if (!work)
- break;
-
- list_del(&work->list);
- clear_bit(WORK_QUEUED_BIT, &work->flags);
-
- work->worker = worker;
-
- work->func(work);
-
- atomic_dec(&worker->num_pending);
- /*
- * unless this is an ordered work queue,
- * 'work' was probably freed by func above.
- */
- run_ordered_completions(worker->workers, work);
-
- check_pending_worker_creates(worker);
- cond_resched();
- }
-
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
-
- if (freezing(current)) {
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
- try_to_freeze();
- } else {
- spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop()) {
- cpu_relax();
- /*
- * we've dropped the lock, did someone else
- * jump_in?
- */
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- /*
- * this short schedule allows more work to
- * come in without the queue functions
- * needing to go through wake_up_process()
- *
- * worker->working is still 1, so nobody
- * is going to try and wake us up
- */
- schedule_timeout(1);
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- if (kthread_should_stop())
- break;
-
- /* still no more work?, sleep for real */
- spin_lock_irq(&worker->lock);
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending)) {
- spin_unlock_irq(&worker->lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
-
- /*
- * this makes sure we get a wakeup when someone
- * adds something new to the queue
- */
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
-
- if (!kthread_should_stop()) {
- schedule_timeout(HZ * 120);
- if (!worker->working &&
- try_worker_shutdown(worker)) {
- return 0;
- }
- }
- }
- __set_current_state(TASK_RUNNING);
- }
- } while (!kthread_should_stop());
- return 0;
-}
-
-/*
- * this will wait for all the worker threads to shutdown
- */
-void btrfs_stop_workers(struct btrfs_workers *workers)
-{
- struct list_head *cur;
- struct btrfs_worker_thread *worker;
- int can_stop;
-
- spin_lock_irq(&workers->lock);
- workers->stopping = 1;
- list_splice_init(&workers->idle_list, &workers->worker_list);
- while (!list_empty(&workers->worker_list)) {
- cur = workers->worker_list.next;
- worker = list_entry(cur, struct btrfs_worker_thread,
- worker_list);
-
- atomic_inc(&worker->refs);
- workers->num_workers -= 1;
- if (!list_empty(&worker->worker_list)) {
- list_del_init(&worker->worker_list);
- put_worker(worker);
- can_stop = 1;
- } else
- can_stop = 0;
- spin_unlock_irq(&workers->lock);
- if (can_stop)
- kthread_stop(worker->task);
- spin_lock_irq(&workers->lock);
- put_worker(worker);
+ work = container_of(arg, struct btrfs_work, normal_work);
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+ wq = work->wq;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq);
}
- spin_unlock_irq(&workers->lock);
+ if (!need_order)
+ trace_btrfs_all_work_done(work);
}
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_helper)
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free)
{
- workers->num_workers = 0;
- workers->num_workers_starting = 0;
- INIT_LIST_HEAD(&workers->worker_list);
- INIT_LIST_HEAD(&workers->idle_list);
- INIT_LIST_HEAD(&workers->order_list);
- INIT_LIST_HEAD(&workers->prio_order_list);
- spin_lock_init(&workers->lock);
- spin_lock_init(&workers->order_lock);
- workers->max_workers = max;
- workers->idle_thresh = 32;
- workers->name = name;
- workers->ordered = 0;
- workers->atomic_start_pending = 0;
- workers->atomic_worker_start = async_helper;
- workers->stopping = 0;
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
}
-/*
- * starts new worker threads. This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers)
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- int ret = 0;
-
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
-
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_create(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + 1);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- goto fail;
- }
+ unsigned long flags;
- spin_lock_irq(&workers->lock);
- if (workers->stopping) {
- spin_unlock_irq(&workers->lock);
- ret = -EINVAL;
- goto fail_kthread;
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
}
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
-
- wake_up_process(worker->task);
- return 0;
-
-fail_kthread:
- kthread_stop(worker->task);
-fail:
- kfree(worker);
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting--;
- spin_unlock_irq(&workers->lock);
- return ret;
+ queue_work(wq->normal_wq, &work->normal_work);
+ trace_btrfs_work_queued(work);
}
-int btrfs_start_workers(struct btrfs_workers *workers)
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting++;
- spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers);
-}
-
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now. This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
-{
- struct btrfs_worker_thread *worker;
- struct list_head *next;
- int enforce_min;
-
- enforce_min = (workers->num_workers + workers->num_workers_starting) <
- workers->max_workers;
-
- /*
- * if we find an idle thread, don't move it to the end of the
- * idle list. This improves the chance that the next submission
- * will reuse the same thread, and maybe catch it while it is still
- * working
- */
- if (!list_empty(&workers->idle_list)) {
- next = workers->idle_list.next;
- worker = list_entry(next, struct btrfs_worker_thread,
- worker_list);
- return worker;
- }
- if (enforce_min || list_empty(&workers->worker_list))
- return NULL;
-
- /*
- * if we pick a busy task, move the task to the end of the list.
- * hopefully this will keep things somewhat evenly balanced.
- * Do the move in batches based on the sequence number. This groups
- * requests submitted at roughly the same time onto the same worker.
- */
- next = workers->worker_list.next;
- worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- worker->sequence++;
+ struct __btrfs_workqueue *dest_wq;
- if (worker->sequence % workers->idle_thresh == 0)
- list_move_tail(next, &workers->worker_list);
- return worker;
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+ dest_wq = wq->high;
+ else
+ dest_wq = wq->normal;
+ __btrfs_queue_work(dest_wq, work);
}
-/*
- * selects a worker thread to take the next job. This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- struct list_head *fallback;
- int ret;
-
- spin_lock_irqsave(&workers->lock, flags);
-again:
- worker = next_worker(workers);
-
- if (!worker) {
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers) {
- goto fallback;
- } else if (workers->atomic_worker_start) {
- workers->atomic_start_pending = 1;
- goto fallback;
- } else {
- workers->num_workers_starting++;
- spin_unlock_irqrestore(&workers->lock, flags);
- /* we're below the limit, start another worker */
- ret = __btrfs_start_workers(workers);
- spin_lock_irqsave(&workers->lock, flags);
- if (ret)
- goto fallback;
- goto again;
- }
- }
- goto found;
-
-fallback:
- fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the first one we can find.
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
-found:
- /*
- * this makes sure the worker doesn't exit before it is placed
- * onto a busy/idle list
- */
- atomic_inc(&worker->num_pending);
- spin_unlock_irqrestore(&workers->lock, flags);
- return worker;
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
}
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from. It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-void btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker = work->worker;
- unsigned long flags;
- int wake = 0;
-
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ if (!wq)
return;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
-
- /* by definition we're busy, take ourselves off the idle
- * list
- */
- if (worker->idle) {
- spin_lock(&worker->workers->lock);
- worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- spin_unlock(&worker->workers->lock);
- }
- if (!worker->working) {
- wake = 1;
- worker->working = 1;
- }
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ if (wq->high)
+ __btrfs_destroy_workqueue(wq->high);
+ __btrfs_destroy_workqueue(wq->normal);
+ kfree(wq);
}
-void btrfs_set_work_high_prio(struct btrfs_work *work)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ wq->normal->max_active = max;
+ if (wq->high)
+ wq->high->max_active = max;
}
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_set_work_high_priority(struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- int wake = 0;
-
- /* don't requeue something already on a list */
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- return;
-
- worker = find_worker(workers);
- if (workers->ordered) {
- /*
- * you're not allowed to do ordered queues from an
- * interrupt handler
- */
- spin_lock(&workers->order_lock);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
- list_add_tail(&work->order_list,
- &workers->prio_order_list);
- } else {
- list_add_tail(&work->order_list, &workers->order_list);
- }
- spin_unlock(&workers->order_lock);
- } else {
- INIT_LIST_HEAD(&work->order_list);
- }
-
- spin_lock_irqsave(&worker->lock, flags);
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- check_busy_worker(worker);
-
- /*
- * avoid calling into wake_up_process if this thread has already
- * been kicked
- */
- if (!worker->working)
- wake = 1;
- worker->working = 1;
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
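The thresh_queue_hook()/thresh_exec_hook() pair added above is what preserves the old pool's load-based scaling on top of a normal workqueue: each queued item bumps an atomic pending counter, and each executed item may nudge the underlying queue's max_active up or down within [1, max_active] depending on how pending compares with thresh. The toy below models just that adjustment arithmetic outside the kernel (plain userspace C, no locking or tracing; the names are invented, not the kernel's), mirroring the counter check as written in the patch:

/* Toy model of the thresh_exec_hook() adjustment above.  Userspace C,
 * no locking/tracing; struct and function names are made up. */
#include <stdio.h>

struct toy_wq {
	int thresh;		/* e.g. 32, the DFT_THRESHOLD in the patch */
	int max_active;		/* limit handed in at allocation time */
	int current_max;	/* value that would go to workqueue_set_max_active() */
	unsigned int count;	/* cycles modulo thresh/4, as in the patch */
};

static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Called once per executed work item with the current pending count. */
static void toy_exec_hook(struct toy_wq *wq, int pending)
{
	int new_max;

	wq->count = (wq->count + 1) % (wq->thresh / 4);
	if (!wq->count)
		return;		/* mirrors the "if (!wq->count) goto out" above */

	new_max = wq->current_max;
	if (pending > wq->thresh)
		new_max++;
	if (pending < wq->thresh / 2)
		new_max--;
	new_max = clamp_int(new_max, 1, wq->max_active);
	if (new_max != wq->current_max) {
		wq->current_max = new_max;
		printf("would call workqueue_set_max_active(wq, %d)\n", new_max);
	}
}

int main(void)
{
	struct toy_wq wq = { .thresh = 32, .max_active = 8, .current_max = 1 };

	toy_exec_hook(&wq, 40);	/* heavy backlog: concurrency ramps up */
	toy_exec_hook(&wq, 40);
	toy_exec_hook(&wq, 4);	/* queue drained: concurrency ramps back down */
	return 0;
}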
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
struct btrfs_work {
- /*
- * func should be set to the function you want called
- * your work struct is passed as the only arg
- *
- * ordered_func must be set for work sent to an ordered work queue,
- * and it is called to complete a given work item in the same
- * order they were sent to the queue.
- */
- void (*func)(struct btrfs_work *work);
- void (*ordered_func)(struct btrfs_work *work);
- void (*ordered_free)(struct btrfs_work *work);
-
- /*
- * flags should be set to zero. It is used to make sure the
- * struct is only inserted once into the list.
- */
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct __btrfs_workqueue *wq;
unsigned long flags;
-
- /* don't touch these */
- struct btrfs_worker_thread *worker;
- struct list_head list;
- struct list_head order_list;
-};
-
-struct btrfs_workers {
- /* current number of running workers */
- int num_workers;
-
- int num_workers_starting;
-
- /* max number of workers allowed. changed by btrfs_start_workers */
- int max_workers;
-
- /* once a worker has this many requests or fewer, it is idle */
- int idle_thresh;
-
- /* force completions in the order they were queued */
- int ordered;
-
- /* more workers required, but in an interrupt handler */
- int atomic_start_pending;
-
- /*
- * are we allowed to sleep while starting workers or are we required
- * to start them at a later time? If we can't sleep, this indicates
- * which queue we need to use to schedule thread creation.
- */
- struct btrfs_workers *atomic_worker_start;
-
- /* list with all the work threads. The workers on the idle thread
- * may be actively servicing jobs, but they haven't yet hit the
- * idle thresh limit above.
- */
- struct list_head worker_list;
- struct list_head idle_list;
-
- /*
- * when operating in ordered mode, this maintains the list
- * of work items waiting for completion
- */
- struct list_head order_list;
- struct list_head prio_order_list;
-
- /* lock for finding the next worker thread to queue on */
- spinlock_t lock;
-
- /* lock for the ordered lists */
- spinlock_t order_lock;
-
- /* extra name for this worker, used for current->name */
- char *name;
-
- int stopping;
};
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
#endif
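The header above keeps the ordered-completion contract of the old pool, now enforced by run_ordered_work() in async-thread.c: func() may run on any worker, but ordered_func() is invoked in submission order once WORK_DONE_BIT is set, and ordered_free() is the final callback, made with the list lock dropped, so it may free the item. A hedged sketch of that usage (only the btrfs_* calls come from the patch; the my_* names are invented):

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include "async-thread.h"

struct my_ordered_job {
	struct btrfs_work work;
	int result;
};

static void my_compute(struct btrfs_work *work)		/* may run in any order */
{
	struct my_ordered_job *job =
		container_of(work, struct my_ordered_job, work);

	job->result = 42;
}

static void my_publish(struct btrfs_work *work)		/* strict submission order */
{
	struct my_ordered_job *job =
		container_of(work, struct my_ordered_job, work);

	pr_debug("ordered result %d\n", job->result);
}

static void my_release(struct btrfs_work *work)		/* last callback, may free */
{
	kfree(container_of(work, struct my_ordered_job, work));
}

static int my_submit_ordered(struct btrfs_workqueue *wq)
{
	struct my_ordered_job *job = kzalloc(sizeof(*job), GFP_NOFS);

	if (!job)
		return -ENOMEM;
	btrfs_init_work(&job->work, my_compute, my_publish, my_release);
	btrfs_queue_work(wq, &job->work);
	return 0;
}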
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..aad7201ad11b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents, struct __prelim_ref *ref,
- int level, u64 time_seq, const u64 *extent_item_pos)
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ u64 total_refs)
{
int ret = 0;
int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
ret = btrfs_next_old_leaf(root, path, time_seq);
- while (!ret && count < ref->count) {
+ while (!ret && count < total_refs) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct __prelim_ref *ref,
struct ulist *parents,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos);
+ extent_item_pos, total_refs);
out:
path->lowest_level = 0;
btrfs_release_path(path);
@@ -374,7 +375,7 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
int err;
int ret = 0;
@@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
if (ref->count == 0)
continue;
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos);
+ parents, extent_item_pos,
+ total_refs);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs)
+ struct list_head *prefs, u64 *total_refs)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
default:
BUG_ON(1);
}
+ *total_refs += (node->ref_mod * sgn);
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
*/
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int *info_level, struct list_head *prefs)
+ int *info_level, struct list_head *prefs,
+ u64 *total_refs)
{
int ret = 0;
int slot;
@@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct list_head prefs;
struct __prelim_ref *ref;
struct extent_inode_elem *eie = NULL;
+ u64 total_refs = 0;
INIT_LIST_HEAD(&prefs);
INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!trans)
+ if (!trans) {
path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +923,7 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed);
+ &prefs_delayed, &total_refs);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -936,7 +944,8 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
- &info_level, &prefs);
+ &info_level, &prefs,
+ &total_refs);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +965,7 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos);
+ extent_item_pos, total_refs);
if (ret)
goto out;
@@ -965,7 +974,7 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
- if (ref->count && ref->root_id && ref->parent == 0) {
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
@@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos)
{
- struct ulist *tmp;
int ret;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
*leafs = ulist_alloc(GFP_NOFS);
- if (!*leafs) {
- ulist_free(tmp);
+ if (!*leafs)
return -ENOMEM;
- }
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, tmp, extent_item_pos);
- ulist_free(tmp);
-
+ time_seq, *leafs, NULL, extent_item_pos);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
if (ret < 0)
return ret;
- while (1) {
- u32 nritems;
- if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
- ret = btrfs_prev_leaf(fs_info->extent_root, path);
- if (ret != 0) {
- if (ret > 0) {
- pr_debug("logical %llu is not within "
- "any extent\n", logical);
- ret = -ENOENT;
- }
- return ret;
- }
- } else {
- path->slots[0]--;
- }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (nritems == 0) {
- pr_debug("logical %llu is not within any extent\n",
- logical);
- return -ENOENT;
- }
- if (path->slots[0] == nritems)
- path->slots[0]--;
-
- btrfs_item_key_to_cpu(path->nodes[0], found_key,
- path->slots[0]);
- if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
- found_key->type == BTRFS_METADATA_ITEM_KEY)
- break;
+ ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
}
-
+ btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
size = fs_info->extent_root->leafsize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
u64 last_trans;
/*
- * log transid when this inode was last modified
+ * transid that last logged this inode
*/
- u64 last_sub_trans;
+ u64 logged_trans;
/*
- * transid that last logged this inode
+ * log transid when this inode was last modified
*/
- u64 logged_trans;
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
- /* a local copy of root's last_log_commit */
- unsigned long last_log_commit;
-
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..88d1b1eedc9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
int advance_right;
u64 left_blockptr;
u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
u64 left_start_ctransid;
u64 right_start_ctransid;
u64 ctransid;
@@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_blockptr = btrfs_node_blockptr(
right_path->nodes[right_level],
right_path->slots[right_level]);
- if (left_blockptr == right_blockptr) {
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
/*
* As we're on a shared block, don't
* allow to go deeper.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..bc96c03dd259 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_ERROR 0
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
+#define BTRFS_FS_STATE_DEV_REPLACING 3
/* Super block flags */
/* Errors detected */
@@ -1489,6 +1490,7 @@ struct btrfs_fs_info {
*/
struct list_head ordered_roots;
+ struct mutex delalloc_root_mutex;
spinlock_t delalloc_root_lock;
/* all fs/file tree roots that have delalloc inodes. */
struct list_head delalloc_roots;
@@ -1503,28 +1505,27 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
- struct btrfs_workers generic_worker;
- struct btrfs_workers workers;
- struct btrfs_workers delalloc_workers;
- struct btrfs_workers flush_workers;
- struct btrfs_workers endio_workers;
- struct btrfs_workers endio_meta_workers;
- struct btrfs_workers endio_raid56_workers;
- struct btrfs_workers rmw_workers;
- struct btrfs_workers endio_meta_write_workers;
- struct btrfs_workers endio_write_workers;
- struct btrfs_workers endio_freespace_worker;
- struct btrfs_workers submit_workers;
- struct btrfs_workers caching_workers;
- struct btrfs_workers readahead_workers;
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct btrfs_workqueue *endio_workers;
+ struct btrfs_workqueue *endio_meta_workers;
+ struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *rmw_workers;
+ struct btrfs_workqueue *endio_meta_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *submit_workers;
+ struct btrfs_workqueue *caching_workers;
+ struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
* for the sys_munmap function call path
*/
- struct btrfs_workers fixup_workers;
- struct btrfs_workers delayed_workers;
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
@@ -1604,9 +1605,9 @@ struct btrfs_fs_info {
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
int scrub_workers_refcnt;
- struct btrfs_workers scrub_workers;
- struct btrfs_workers scrub_wr_completion_workers;
- struct btrfs_workers scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_workers;
+ struct btrfs_workqueue *scrub_wr_completion_workers;
+ struct btrfs_workqueue *scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1647,7 +1648,7 @@ struct btrfs_fs_info {
/* qgroup rescan items */
struct mutex qgroup_rescan_lock; /* protects the progress item */
struct btrfs_key qgroup_rescan_progress;
- struct btrfs_workers qgroup_rescan_workers;
+ struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
@@ -1674,10 +1675,18 @@ struct btrfs_fs_info {
atomic_t mutually_exclusive_operation_running;
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
};
+struct btrfs_subvolume_writers {
+ struct percpu_counter counter;
+ wait_queue_head_t wait;
+};
+
/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
@@ -1714,11 +1723,15 @@ struct btrfs_root {
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
atomic_t log_writers;
atomic_t log_commit[2];
atomic_t log_batch;
- unsigned long log_transid;
- unsigned long last_log_commit;
+ int log_transid;
+ /* Updated whether the commit succeeds or not */
+ int log_transid_committed;
+ /* Only updated when the commit succeeds. */
+ int last_log_commit;
pid_t log_start_pid;
bool log_multiple_pids;
@@ -1793,6 +1806,7 @@ struct btrfs_root {
spinlock_t root_item_lock;
atomic_t refs;
+ struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
/*
* all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1816,8 @@ struct btrfs_root {
struct list_head delalloc_inodes;
struct list_head delalloc_root;
u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
/*
* this is used by the balancing code to wait for all the pending
* ordered extents
@@ -1822,6 +1838,8 @@ struct btrfs_root {
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
int send_in_progress;
+ struct btrfs_subvolume_writers *subv_writers;
+ atomic_t will_be_snapshoted;
};
struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
/* reada.c */
struct reada_control {
struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- async_work->work.func = btrfs_async_run_delayed_root;
- async_work->work.flags = 0;
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+ NULL, NULL);
async_work->nr = nr;
- btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
+ btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
*/
static struct btrfs_delayed_ref_head *
find_ref_head(struct rb_root *root, u64 bytenr,
- struct btrfs_delayed_ref_head **last, int return_bigger)
+ int return_bigger)
{
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
- int cmp = 0;
-again:
n = root->rb_node;
entry = NULL;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
- if (last)
- *last = entry;
if (bytenr < entry->node.bytenr)
- cmp = -1;
- else if (bytenr > entry->node.bytenr)
- cmp = 1;
- else
- cmp = 0;
-
- if (cmp < 0)
n = n->rb_left;
- else if (cmp > 0)
+ else if (bytenr > entry->node.bytenr)
n = n->rb_right;
else
return entry;
}
if (entry && return_bigger) {
- if (cmp > 0) {
+ if (bytenr > entry->node.bytenr) {
n = rb_next(&entry->href_node);
if (!n)
n = rb_first(root);
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- bytenr = entry->node.bytenr;
- return_bigger = 0;
- goto again;
+ return entry;
}
return entry;
}
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
again:
start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
if (!head && !loop) {
delayed_refs->run_delayed_start = 0;
start = 0;
loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
if (!head)
return NULL;
} else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref = btrfs_delayed_node_to_head(update);
BUG_ON(existing_ref->is_data != ref->is_data);
+ spin_lock(&existing_ref->lock);
if (ref->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
- spin_lock(&existing_ref->lock);
existing->ref_mod += update->ref_mod;
spin_unlock(&existing_ref->lock);
}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
+ return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
void btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
return ret;
}
+/*
+ * block until all in-flight bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ do {
+ prepare_to_wait(&fs_info->replace_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ writers = percpu_counter_sum(&fs_info->bio_counter);
+ if (writers)
+ schedule();
+ finish_wait(&fs_info->replace_wait, &wait);
+ } while (writers);
+}
+
+/*
+ * we have removed the target device, it is safe to allow new bio requests.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device = dev_replace->srcdev;
btrfs_dev_replace_unlock(dev_replace);
- /* replace old device with new one in mapping tree */
- if (!scrub_ret)
- btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
- src_device,
- tgt_device);
-
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- if (scrub_ret) {
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret) {
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
printk_in_rcu(KERN_ERR
"BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ btrfs_rm_dev_replace_blocked(fs_info);
+
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
mutex_unlock(&dev_replace->lock_management_lock);
}
}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_dec(&fs_info->bio_counter);
+
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ DEFINE_WAIT(wait);
+again:
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ goto again;
+ }
+
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..bd0f752b797b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
fs_info = end_io_wq->info;
end_io_wq->error = err;
- end_io_wq->work.func = end_workqueue_fn;
- end_io_wq->work.flags = 0;
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- btrfs_queue_worker(&fs_info->endio_meta_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- btrfs_queue_worker(&fs_info->endio_freespace_worker,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_freespace_worker,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_write_workers,
+ &end_io_wq->work);
} else {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata)
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_workers,
+ &end_io_wq->work);
}
}
@@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
unsigned long limit = min_t(unsigned long,
- info->workers.max_workers,
+ info->thread_pool_size,
info->fs_devices->open_devices);
return 256 * limit;
}
@@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- async->work.func = run_one_async_start;
- async->work.ordered_func = run_one_async_done;
- async->work.ordered_free = run_one_async_free;
+ btrfs_init_work(&async->work, run_one_async_start,
+ run_one_async_done, run_one_async_free);
- async->work.flags = 0;
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
@@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
- btrfs_set_work_high_prio(&async->work);
+ btrfs_set_work_high_priority(&async->work);
- btrfs_queue_worker(&fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->workers, &async->work);
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
}
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+ struct btrfs_subvolume_writers *writers;
+ int ret;
+
+ writers = kmalloc(sizeof(*writers), GFP_NOFS);
+ if (!writers)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_counter_init(&writers->counter, 0);
+ if (ret < 0) {
+ kfree(writers);
+ return ERR_PTR(ret);
+ }
+
+ init_waitqueue_head(&writers->wait);
+ return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+ percpu_counter_destroy(&writers->counter);
+ kfree(writers);
+}
+
static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
u32 stripesize, struct btrfs_root *root,
struct btrfs_fs_info *fs_info,
@@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
+ atomic_set(&root->will_be_snapshoted, 0);
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
if (fs_info)
extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
return 0;
}
@@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root)
{
int ret;
+ struct btrfs_subvolume_writers *writers;
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
goto fail;
}
+ writers = btrfs_alloc_subvolume_writers();
+ if (IS_ERR(writers)) {
+ ret = PTR_ERR(writers);
+ goto fail;
+ }
+ root->subv_writers = writers;
+
btrfs_init_free_ino_ctl(root);
mutex_init(&root->fs_commit_mutex);
spin_lock_init(&root->cache_lock);
@@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ goto free_writers;
return 0;
+
+free_writers:
+ btrfs_free_subvolume_writers(root->subv_writers);
fail:
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
@@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_raid56_workers);
- btrfs_stop_workers(&fs_info->rmw_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->delayed_workers);
- btrfs_stop_workers(&fs_info->caching_workers);
- btrfs_stop_workers(&fs_info->readahead_workers);
- btrfs_stop_workers(&fs_info->flush_workers);
- btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ btrfs_destroy_workqueue(fs_info->endio_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->rmw_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->submit_workers);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->readahead_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
+ int max_active;
+ int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
bool create_uuid_tree;
bool check_uuid_tree;
@@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
+ ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ if (ret) {
+ err = ret;
+ goto fail_delalloc_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_delalloc_bytes;
+ goto fail_bio_counter;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->buffer_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->replace_wait);
init_waitqueue_head(&fs_info->scrub_pause_wait);
fs_info->scrub_workers_refcnt = 0;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- btrfs_init_workers(&fs_info->generic_worker,
- "genwork", 1, NULL);
-
- btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
+ max_active = fs_info->thread_pool_size;
- btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
- btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
- btrfs_init_workers(&fs_info->submit_workers, "submit",
- min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size), NULL);
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
- btrfs_init_workers(&fs_info->caching_workers, "cache",
- fs_info->thread_pool_size, NULL);
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
- /* a higher idle thresh on the submit workers makes it much more
+ /*
+ * a higher idle thresh on the submit workers makes it much more
 * likely that bios will be sent down in a sane order to the
* devices
*/
- fs_info->submit_workers.idle_thresh = 64;
-
- fs_info->workers.idle_thresh = 16;
- fs_info->workers.ordered = 1;
-
- fs_info->delalloc_workers.idle_thresh = 2;
- fs_info->delalloc_workers.ordered = 1;
-
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_raid56_workers,
- "endio-raid56", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->rmw_workers,
- "rmw", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
- 1, &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->readahead_workers, "readahead",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
- &fs_info->generic_worker);
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
- fs_info->endio_workers.idle_thresh = 4;
- fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_raid56_workers.idle_thresh = 4;
- fs_info->rmw_workers.idle_thresh = 2;
-
- fs_info->endio_write_workers.idle_thresh = 2;
- fs_info->endio_meta_write_workers.idle_thresh = 2;
- fs_info->readahead_workers.idle_thresh = 2;
-
- /*
- * btrfs_start_workers can really only fail because of ENOMEM so just
- * return -ENOMEM if any of these fail.
- */
- ret = btrfs_start_workers(&fs_info->workers);
- ret |= btrfs_start_workers(&fs_info->generic_worker);
- ret |= btrfs_start_workers(&fs_info->submit_workers);
- ret |= btrfs_start_workers(&fs_info->delalloc_workers);
- ret |= btrfs_start_workers(&fs_info->fixup_workers);
- ret |= btrfs_start_workers(&fs_info->endio_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
- ret |= btrfs_start_workers(&fs_info->rmw_workers);
- ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
- ret |= btrfs_start_workers(&fs_info->delayed_workers);
- ret |= btrfs_start_workers(&fs_info->caching_workers);
- ret |= btrfs_start_workers(&fs_info->readahead_workers);
- ret |= btrfs_start_workers(&fs_info->flush_workers);
- ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
- if (ret) {
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
}
@@ -2963,6 +2977,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
+fail_bio_counter:
+ percpu_counter_destroy(&fs_info->bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_send++;
continue;
@@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_wait++;
continue;
@@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
+ if (root->subv_writers)
+ btrfs_free_subvolume_writers(root->subv_writers);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->free_ino_ctl);
@@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->bio_counter);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
@@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
btrfs_destroy_ordered_extents(root);
- cond_resched_lock(&fs_info->ordered_root_lock);
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
}
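
The disk-io.c hunks convert every helper pool from the old btrfs_workers API to the btrfs_workqueue API introduced earlier in this series. From the call sites in this diff, the lifecycle is: allocate a queue (which can fail, hence the combined NULL check before fail_sb_buffer), initialize a work item with a run function plus optional ordered callbacks, queue it, and destroy the queue in btrfs_stop_all_workers(). A condensed sketch of that usage, not a complete function; example_func and max_active are placeholders:

/* Sketch based on the call sites above. */
struct btrfs_workqueue *wq;
struct btrfs_work work;

wq = btrfs_alloc_workqueue("example",
			   WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
			   max_active /* thread_pool_size */,
			   4 /* idle thresh, 0 = none */);
if (!wq)
	return -ENOMEM;

/* The ordered callbacks may be NULL when ordering is not needed. */
btrfs_init_work(&work, example_func, NULL, NULL);
btrfs_queue_work(wq, &work);

/* unmount / error teardown */
btrfs_destroy_workqueue(wq);
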
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..c6b6a6e3e735 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- caching_ctl->work.func = caching_thread;
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
btrfs_get_block_group(cache);
- btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
return ret;
}
@@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root,
}
static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
- unsigned long nr_pages)
+ unsigned long nr_pages, int nr_items)
{
struct super_block *sb = root->fs_info->sb;
@@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 * the filesystem is readonly (all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_roots(root->fs_info, 0);
+ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, nr_items);
}
}
@@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- btrfs_writeback_inodes_sb_nr(root, nr_pages);
+ btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
* we do anything.
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root,
goto commit;
/* See if there is enough pinned space to make this reservation */
- spin_lock(&space_info->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes) >= 0) {
- spin_unlock(&space_info->lock);
+ bytes) >= 0)
goto commit;
- }
- spin_unlock(&space_info->lock);
/*
* See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root,
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
- spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
commit:
trans = btrfs_join_transaction(root);
@@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes, orig_bytes,
+ shrink_delalloc(root, num_bytes * 2, orig_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case ALLOC_CHUNK:
@@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
range->len = trimmed;
return ret;
}
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want,drop}_write(); they are
+ * used to prevent tasks from writing data into the page cache via nocow
+ * before the subvolume is snapshotted and then flushing that data to disk
+ * only after the snapshot has been created.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+ percpu_counter_dec(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we wake up
+ * waiters.
+ */
+ smp_mb();
+ if (waitqueue_active(&root->subv_writers->wait))
+ wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+ if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ return 0;
+
+ percpu_counter_inc(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we check for snapshot creation.
+ */
+ smp_mb();
+ if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ btrfs_end_nocow_write(root);
+ return 0;
+ }
+ return 1;
+}
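
btrfs_start_nocow_write() and btrfs_end_nocow_write() act like the read side of a per-subvolume rwsem: nocow writers take a ticket in subv_writers->counter, while snapshot creation (see the ioctl.c hunks below) sets will_be_snapshoted and waits for the counter to drain. Checking will_be_snapshoted both before and after the increment closes the race where a snapshot starts in between. The expected caller pairing, matching check_can_nocow() later in this diff:

/* A return of 0 means "nocow writes are blocked, fall back to COW". */
if (!btrfs_start_nocow_write(root))
	return -ENOSPC;

/* ... verify the range can really be overwritten in place ... */

btrfs_end_nocow_write(root);	/* must be paired on every exit path */
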
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..ae69a00387e7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+static struct rb_node *tree_insert(struct rb_root *root,
+ struct rb_node *search_start,
+ u64 offset,
struct rb_node *node,
struct rb_node ***p_in,
struct rb_node **parent_in)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct tree_entry *entry;
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
goto do_insert;
}
+ p = search_start ? &search_start : &root->rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
set_state_bits(tree, state, bits);
- node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
+ node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
- NULL, NULL);
+ node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+ &prealloc->rb_node, NULL, NULL);
if (node) {
free_extent_state(prealloc);
return -EEXIST;
@@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
if (em_cached && *em_cached) {
em = *em_cached;
- if (em->in_tree && start >= em->start &&
+ if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
atomic_inc(&em->refs);
return em;
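
tree_insert() in extent_io.c now takes an optional search_start node. split_state() passes &orig->rb_node because the new lower half produced by a split is, by construction, the immediate predecessor of the state being split, so the rb-tree walk can start there instead of at the root. The two call styles, taken from the hunks above:

/* No hint: descend from the root, as insert_state() still does. */
node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);

/* Hinted: start the descent at the state being split (split_state()). */
node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
		   &prealloc->rb_node, NULL, NULL);
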
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
return;
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
- WARN_ON(em->in_tree);
+ WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- WARN_ON(!entry->in_tree);
-
if (em->start < entry->start)
p = &(*p)->rb_left;
else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
if (end > entry->start && em->start < extent_map_end(entry))
return -EEXIST;
- em->in_tree = 1;
rb_link_node(&em->rb_node, orig_parent, p);
rb_insert_color(&em->rb_node, root);
return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
prev = n;
prev_entry = entry;
- WARN_ON(!entry->in_tree);
-
if (offset < entry->start)
n = n->rb_left;
else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- merge->in_tree = 0;
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
}
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
+ RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- if (em->in_tree)
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
try_merge_map(tree, em);
}
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
if (ret)
goto out;
- atomic_inc(&em->refs);
-
- em->mod_start = em->start;
- em->mod_len = em->len;
-
- if (modified)
- list_move(&em->list, &tree->modified_extents);
- else
- try_merge_map(tree, em);
+ setup_extent_mapping(tree, em, modified);
out:
return ret;
}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
rb_erase(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
return ret;
}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
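
The extent_map.c changes retire the separate in_tree flag and derive tree membership from the rb_node itself: RB_CLEAR_NODE() marks a node as detached and RB_EMPTY_NODE() tests that, which is what the new extent_map_in_tree() helper in extent_map.h wraps. The invariant the patch maintains, in outline (illustrative, mirroring the hunks above):

/* Allocation: start out detached (alloc_extent_map()). */
RB_CLEAR_NODE(&em->rb_node);

/* Removal: erase and immediately re-mark as detached
 * (remove_extent_mapping(), try_merge_map()). */
rb_erase(&em->rb_node, &tree->map);
RB_CLEAR_NODE(&em->rb_node);

/* Membership test replaces the old em->in_tree flag. */
if (extent_map_in_tree(em))	/* i.e. !RB_EMPTY_NODE(&em->rb_node) */
	try_merge_map(tree, em);
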
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree;
unsigned int compress_type;
struct list_head list;
};
@@ -44,6 +43,11 @@ struct extent_map_tree {
rwlock_t lock;
};
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
static inline u64 extent_map_end(struct extent_map *em)
{
if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7331a230e30b..e1ffb1e22898 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);
- remove_extent_mapping(em_tree, em);
if (no_splits)
goto next;
@@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->orig_block_len = 0;
}
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ ASSERT(ret == 0); /* Logic error */
+ }
free_extent_map(split);
split = NULL;
}
next:
+ if (extent_map_in_tree(em))
+ remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us */
@@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size)
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
modify_tree = 0;
while (1) {
@@ -798,7 +804,10 @@ next_slot:
*/
if (start > key.offset && end < extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EINVAL;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = start;
@@ -841,7 +850,10 @@ next_slot:
* | -------- extent -------- |
*/
if (start <= key.offset && end < extent_end) {
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EINVAL;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
@@ -864,7 +876,10 @@ next_slot:
*/
if (start > key.offset && end >= extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EINVAL;
+ break;
+ }
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
@@ -938,34 +953,42 @@ next_slot:
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
- * path->slots[0] for our insertion.
+ * path->slots[0] for our insertion (if replace_extent != 0).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
btrfs_abort_transaction(trans, root, ret);
+ }
- leaf = path->nodes[0];
- /*
- * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
- * is, its contents got pushed to its neighbors), in which case
- * it means path->locks[0] == 0
- */
- if (!ret && replace_extent && leafs_visited == 1 &&
- path->locks[0] &&
- btrfs_leaf_free_space(root, leaf) >=
- sizeof(struct btrfs_item) + extent_item_size) {
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
- *key_inserted = 1;
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && replace_extent && leafs_visited == 1 &&
+ (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+ path->locks[0] == BTRFS_WRITE_LOCK) &&
+ btrfs_leaf_free_space(root, leaf) >=
+ sizeof(struct btrfs_item) + extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
}
+ setup_items_for_insert(root, path, &key,
+ &extent_item_size,
+ extent_item_size,
+ sizeof(struct btrfs_item) +
+ extent_item_size, 1);
+ *key_inserted = 1;
}
if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
start_pos, last_pos, 0, cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset <= last_pos) {
- btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos,
cached_state, GFP_NOFS);
@@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- ret = btrfs_wait_ordered_range(inode, start_pos,
- last_pos - start_pos + 1);
- if (ret)
- return ret;
- else
- return -EAGAIN;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
u64 num_bytes;
int ret;
+ ret = btrfs_start_nocow_write(root);
+ if (!ret)
+ return -ENOSPC;
+
lockstart = round_down(pos, root->sectorsize);
- lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+ lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
while (1) {
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
+ btrfs_end_nocow_write(root);
} else {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- NULL, GFP_NOFS);
- *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ *write_bytes = min_t(size_t, *write_bytes,
+ num_bytes - pos + lockstart);
}
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode,
reserve_bytes);
+ else
+ btrfs_end_nocow_write(root);
break;
}
@@ -1598,6 +1622,9 @@ again:
}
release_bytes = 0;
+ if (only_release_metadata)
+ btrfs_end_nocow_write(root);
+
if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize);
u64 lockend = lockstart +
@@ -1624,10 +1651,12 @@ again:
kfree(pages);
if (release_bytes) {
- if (only_release_metadata)
+ if (only_release_metadata) {
+ btrfs_end_nocow_write(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
- else
+ } else {
btrfs_delalloc_release_space(inode, release_bytes);
+ }
}
return num_written ? num_written : ret;
@@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, root, dentry);
+ btrfs_init_log_ctx(&ctx);
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
- ret = btrfs_sync_log(trans, root);
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_end_transaction(trans, root);
goto out;
@@ -2157,6 +2189,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
((offset + len - 1) >> PAGE_CACHE_SHIFT));
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -2172,14 +2205,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ if (offset < ino_size)
ret = btrfs_truncate_page(inode, offset, len, 0);
mutex_unlock(&inode->i_mutex);
return ret;
}
/* zero back part of the first page */
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ if (offset < ino_size) {
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2221,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
/* zero the front end of the last page */
- if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ if (offset + len < ino_size) {
ret = btrfs_truncate_page(inode, offset + len, 0, 1);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2310,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- break;
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
cur_offset = drop_end;
@@ -2313,10 +2349,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- goto out_trans;
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
}
out_trans:
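
In check_can_nocow() above, the locked range now covers the whole write, [pos, pos + *write_bytes), rounded out to sector boundaries, rather than *write_bytes counted from the rounded-down start, and *write_bytes is clamped to what the found extent actually covers past pos. A standalone worked example of the rounding difference (hypothetical numbers, illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectorsize  = 4096;
	uint64_t pos         = 5000;	/* unaligned write offset */
	uint64_t write_bytes = 4000;	/* write ends at byte 8999 */

	uint64_t lockstart = pos / sectorsize * sectorsize;	/* round_down */
	uint64_t lockend   = (pos + write_bytes + sectorsize - 1)
			     / sectorsize * sectorsize - 1;	/* round_up - 1 */

	assert(lockstart == 4096);
	assert(lockend == 12287);	/* covers the tail of the write */

	/* The old formula, lockstart + round_up(write_bytes) - 1, gave 8191
	 * here and left bytes 8192..8999 outside the locked range. */
	return 0;
}
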
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 49ec1398879f..06e9a4152b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode,
if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock;
}
num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- async_cow->work.func = async_cow_start;
- async_cow->work.ordered_func = async_cow_submit;
- async_cow->work.ordered_free = async_cow_free;
- async_cow->work.flags = 0;
+ btrfs_init_work(&async_cow->work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
- btrfs_queue_worker(&root->fs_info->delalloc_workers,
- &async_cow->work);
+ btrfs_queue_work(root->fs_info->delalloc_workers,
+ &async_cow->work);
if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
wait_event(root->fs_info->async_submit_wait,
@@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- fixup->work.func = btrfs_writepage_fixup_worker;
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
return -EBUSY;
}
@@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return PTR_ERR(root);
}
+ if (btrfs_root_readonly(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
/* step 2: get inode */
key.objectid = backref->inum;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workers *workers;
+ struct btrfs_workqueue *workers;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
end - start + 1, uptodate))
return 0;
- ordered_extent->work.func = finish_ordered_fn;
- ordered_extent->work.flags = 0;
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
if (btrfs_is_free_space_inode(inode))
- workers = &root->fs_info->endio_freespace_worker;
+ workers = root->fs_info->endio_freespace_worker;
else
- workers = &root->fs_info->endio_write_workers;
- btrfs_queue_worker(workers, &ordered_extent->work);
+ workers = root->fs_info->endio_write_workers;
+ btrfs_queue_work(workers, &ordered_extent->work);
return 0;
}
@@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
@@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5872,6 +5877,7 @@ out_unlock:
inode_dec_link_count(inode);
iput(inode);
}
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5996,6 +6003,7 @@ out_fail:
btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
int ret;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 disk_bytenr;
@@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
+
+ num_bytes = min(offset + *len, extent_end) - offset;
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
btrfs_release_path(path);
/*
@@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- num_bytes = min(offset + *len, extent_end) - offset;
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out;
/*
@@ -7024,10 +7046,9 @@ again:
if (!ret)
goto out_test;
- ordered->work.func = finish_ordered_fn;
- ordered->work.flags = 0;
- btrfs_queue_worker(&root->fs_info->endio_write_workers,
- &ordered->work);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
smp_mb__after_atomic_inc();
/*
- * The generic stuff only does filemap_write_and_wait_range, which isn't
- * enough if we've written compressed pages to this area, so we need to
- * call btrfs_wait_ordered_range to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
*/
count = iov_length(iov, nr_segs);
- ret = btrfs_wait_ordered_range(inode, offset, count);
- if (ret)
- return ret;
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset, count);
if (rw & WRITE) {
/*
@@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
work->inode = inode;
work->wait = wait;
work->delay_iput = delay_iput;
- work->work.func = btrfs_run_delalloc_work;
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+ int nr)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
+ mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
@@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
else
iput(inode);
ret = -ENOMEM;
- goto out;
+ break;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
-
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ ret++;
+ if (nr != -1 && ret >= nr)
+ break;
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
- return 0;
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
- }
if (!list_empty_careful(&splice)) {
spin_lock(&root->delalloc_lock);
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
}
+ mutex_unlock(&root->delalloc_mutex);
return ret;
}
@@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EROFS;
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, -1);
+ if (ret > 0)
+ ret = 0;
/*
* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
@@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return ret;
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice)) {
+ while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_fs_root(root);
@@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, nr);
btrfs_put_fs_root(root);
- if (ret)
+ if (ret < 0)
goto out;
+ if (nr != -1) {
+ nr -= ret;
+ WARN_ON(nr < 0);
+ }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
+ ret = 0;
atomic_inc(&fs_info->async_submit_draining);
while (atomic_read(&fs_info->nr_async_submits) ||
atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
atomic_read(&fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&fs_info->async_submit_draining);
- return 0;
out:
if (!list_empty_careful(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
}
+ mutex_unlock(&fs_info->delalloc_root_mutex);
return ret;
}
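
The delalloc flushing entry points gain an nr argument: -1 keeps the old "flush everything" behaviour, while a positive value is a budget of inodes; __start_delalloc_inodes() now returns how many flush works it queued and btrfs_start_delalloc_roots() subtracts that from the remaining budget as it walks the roots. The two caller conventions visible in this series:

/* Flush everything, e.g. BTRFS_IOC_SYNC: */
ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);

/* Flush only as much as the reclaim path asked for, e.g. shrink_delalloc()
 * via btrfs_writeback_inodes_sb_nr(): */
btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
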
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..0401397b5c92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
#include "props.h"
#include "sysfs.h"
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
+
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -585,6 +611,23 @@ fail:
return ret;
}
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(&root->subv_writers->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&root->subv_writers->counter);
+ if (writers)
+ schedule();
+
+ finish_wait(&root->subv_writers->wait, &wait);
+ } while (writers);
+}
+
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry, char *name, int namelen,
u64 *async_transid, bool readonly,
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!root->ref_cows)
return -EINVAL;
+ atomic_inc(&root->will_be_snapshoted);
+ smp_mb__after_atomic_inc();
+ btrfs_wait_nocow_write(root);
+
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- return ret;
+ goto out;
btrfs_wait_ordered_extents(root, -1);
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot)
- return -ENOMEM;
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto out;
+ }
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto out;
+ goto free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-out:
+free:
kfree(pending_snapshot);
+out:
+ atomic_dec(&root->will_be_snapshoted);
return ret;
}
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
- path->keep_locks = 1;
-
while (1) {
+ path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+process_slot:
if (min_key.objectid != ino)
goto none;
if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
return 0;
}
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+ goto process_slot;
+ }
+
if (min_key.offset == (u64)-1)
goto none;
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
read_unlock(&em_tree->lock);
if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1);
+ lock_extent_bits(io_tree, start, end, 0, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1);
+ unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
if (IS_ERR(em))
return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+ (em->block_start + em->block_len == next->block_start))
ret = false;
free_extent_map(next);
@@ -1076,10 +1139,12 @@ again:
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
- lock_extent(tree, page_start, page_end);
+ lock_extent_bits(tree, page_start, page_end,
+ 0, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
- unlock_extent(tree, page_start, page_end);
+ unlock_extent_cached(tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
if (!ordered)
break;
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
}
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
/* the filemap_flush will queue IO into the worker threads, but
@@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(src_inode)->root->fs_info,
"Snapshot src from another FS");
- ret = -EINVAL;
+ ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
- ret = -ENOTEMPTY;
+ ret = -EPERM;
+ btrfs_err(root->fs_info, "deleting default subvolume "
+ "%llu is not allowed", key.objectid);
goto out;
}
btrfs_release_path(path);
@@ -2994,8 +3065,9 @@ process_slot:
new_key.offset + datal,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EINVAL)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* decompress into destination's address_space (the file offset
* may change, so source mapping won't do), then recompress (or
* otherwise reinsert) a subrange.
- * - allow ranges within the same file to be cloned (provided
- * they don't overlap)?
+ *
+ * - split destination inode's inline extents. The inline extents can
+ * be either compressed or non-compressed.
*/
/* the destination must be opened for writing */
@@ -4353,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
return btrfs_qgroup_wait_for_completion(root->fs_info);
}
-static long btrfs_ioctl_set_received_subvol(struct file *file,
- void __user *arg)
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct btrfs_ioctl_received_subvol_args *sa)
{
- struct btrfs_ioctl_received_subvol_args *sa = NULL;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
- sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa)) {
- ret = PTR_ERR(sa);
- sa = NULL;
- goto out;
- }
-
/*
* 1 - root item
* 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
+out:
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32)) {
+ ret = PTR_ERR(args32);
+ args32 = NULL;
+ goto out;
+ }
+
+ args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ if (!args64) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+ if (ret)
+ goto out;
+
ret = copy_to_user(arg, sa, sizeof(*sa));
if (ret)
ret = -EFAULT;
out:
kfree(sa);
- up_write(&root->fs_info->subvol_sem);
- mnt_drop_write_file(file);
return ret;
}
@@ -4746,7 +4888,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_balance_progress(root, argp);
case BTRFS_IOC_SET_RECEIVED_SUBVOL:
return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
case BTRFS_IOC_SEND:
return btrfs_ioctl_send(file, argp);
case BTRFS_IOC_GET_DEV_STATS:
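
The compat handler above deliberately copies the receive arguments field by field instead of casting, presumably because the 32-bit and 64-bit layouts differ. Below is a minimal sketch of the 32-bit structure this conversion implies; the field list mirrors the member-by-member copy in btrfs_ioctl_set_received_subvol_32(), while the timespec member widths and the packed attribute are assumptions rather than something visible in this hunk.

/* Sketch of the assumed 32-bit ioctl argument layout; only the fields
 * touched by the copy code above are listed, member widths and packing
 * are assumptions for illustration. */
struct btrfs_ioctl_timespec_32 {
	__u64 sec;
	__u32 nsec;
} __attribute__ ((__packed__));

struct btrfs_ioctl_received_subvol_args_32 {
	char	uuid[BTRFS_UUID_SIZE];
	__u64	stransid;
	__u64	rtransid;
	struct btrfs_ioctl_timespec_32 stime;
	struct btrfs_ioctl_timespec_32 rtime;
	__u64	flags;
} __attribute__ ((__packed__));
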
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -410,10 +413,13 @@ have_entry:
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -424,27 +430,48 @@ out:
}
/* Needs to either be called under a log transaction or the log_mutex */
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
- int index = log->log_transid % 2;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
- spin_lock(&log->log_extents_lock[index]);
- if (list_empty(&ordered->log_list)) {
- list_add_tail(&ordered->log_list, &log->logged_list[index]);
- atomic_inc(&ordered->refs);
- }
- spin_unlock(&log->log_extents_lock[index]);
+ if (!list_empty(&ordered->log_list))
+ continue;
+ list_add_tail(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
INIT_LIST_HEAD(&splice);
INIT_LIST_HEAD(&works);
- mutex_lock(&root->fs_info->ordered_operations_mutex);
+ mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
atomic_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
btrfs_put_ordered_extent(ordered);
cond_resched();
}
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
+ mutex_unlock(&root->ordered_extent_mutex);
return count;
}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->ordered_operations_mutex);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
}
/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
goto out;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_root_lock);
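
The three helpers above split what used to be a single btrfs_get_logged_extents() into collect, abort and commit steps, with the caller owning the intermediate list. A rough sketch of the calling pattern this implies is shown below, assuming a caller shaped like the inode-logging code; the tree-log.c side of the series is not part of this hunk, and copy_items_to_log() is a made-up placeholder.

/* Sketch only: how a logging function is expected to drive the new API. */

/* Placeholder for the actual item-copy step; made up for this sketch. */
static int copy_items_to_log(struct btrfs_root *log, struct inode *inode)
{
	return 0;
}

static int log_one_inode(struct btrfs_root *log, struct inode *inode)
{
	LIST_HEAD(logged_list);
	int ret;

	/* pin the currently running ordered extents of this inode */
	btrfs_get_logged_extents(inode, &logged_list);

	ret = copy_items_to_log(log, inode);
	if (ret)
		/* error: just drop the references we took */
		btrfs_put_logged_extents(&logged_list);
	else
		/* success: splice them into the log's per-transid list */
		btrfs_submit_logged_extents(&logged_list, log);

	return ret;
}
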
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
ret = 0;
}
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
- fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
if (ret) {
err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
return 0;
}
@@ -2190,6 +2191,6 @@ void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = rmw_work;
+ btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = read_rebuild_work;
+ btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- plug->work.flags = 0;
- plug->work.func = unplug_work;
- btrfs_queue_worker(&plug->info->rmw_workers,
- &plug->work);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_queue_work(plug->info->rmw_workers,
+ &plug->work);
return;
}
run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- rmw->work.func = reada_start_machine_worker;
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
- btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+ btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
}
#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..def428a25b2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
rc->block_group->key.objectid, rc->block_group->flags);
- ret = btrfs_start_delalloc_roots(fs_info, 0);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
if (ret < 0) {
err = ret;
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.offset++;
root = btrfs_read_fs_root(tree_root, &root_key);
- err = PTR_RET(root);
+ err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
} else if (err == -ENOENT) {
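
PTR_RET() and PTR_ERR_OR_ZERO() are the same helper under old and new names: zero for a valid pointer, the encoded errno for an ERR_PTR value, which is presumably why the <linux/err.h> include is added above. For illustration, an open-coded equivalent (the function name here is made up):

#include <linux/err.h>

/* Open-coded equivalent of PTR_ERR_OR_ZERO(), for illustration only. */
static inline int my_ptr_err_or_zero(const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	return 0;
}
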
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..93e6d7172844 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
atomic_inc(&fs_info->scrubs_running);
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+	 * Checking the @scrubs_running == @scrubs_paused condition
+	 * inside wait_event() is not an atomic operation, which
+	 * means @scrubs_running/@scrubs_paused may change at any
+	 * time.  Wake up @scrub_pause_wait as often as we can so
+	 * that the transaction commit is blocked for less time.
+ */
+ wake_up(&fs_info->scrub_pause_wait);
+
atomic_inc(&sctx->workers_pending);
}
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- sbio->work.func = scrub_bio_end_io_worker;
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+ NULL, NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
- fixup_nodatasum->work.func = scrub_fixup_nodatasum;
- btrfs_queue_worker(&fs_info->scrub_workers,
- &fixup_nodatasum->work);
+ btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+ NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_workers,
+ &fixup_nodatasum->work);
goto out;
}
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- sbio->work.func = scrub_wr_bio_end_io_worker;
- btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
}
static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ /*
+		 * This must be done before we decrease @scrubs_paused,
+		 * to make sure we don't block the transaction commit
+		 * while we are waiting for pending workers to finish.
+ */
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
- scrub_blocked_if_needed(fs_info);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
btrfs_put_block_group(cache);
if (ret)
@@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
int ret = 0;
+ int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
- btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
- &fs_info->generic_worker);
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ 1, 4);
else
- btrfs_init_workers(&fs_info->scrub_workers, "scrub",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_workers.idle_thresh = 4;
- ret = btrfs_start_workers(&fs_info->scrub_workers);
- if (ret)
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ max_active, 4);
+ if (!fs_info->scrub_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
- "scrubwrc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_wr_completion_workers.idle_thresh = 2;
- ret = btrfs_start_workers(
- &fs_info->scrub_wr_completion_workers);
- if (ret)
+ }
+ fs_info->scrub_wr_completion_workers =
+ btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ max_active, 2);
+ if (!fs_info->scrub_wr_completion_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
- &fs_info->generic_worker);
- ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
- if (ret)
+ }
+ fs_info->scrub_nocow_workers =
+ btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ if (!fs_info->scrub_nocow_workers) {
+ ret = -ENOMEM;
goto out;
+ }
}
++fs_info->scrub_workers_refcnt;
out:
@@ -2793,9 +2820,9 @@ out:
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_stop_workers(&fs_info->scrub_workers);
- btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
- btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
@@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->len = len;
nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
- nocow_ctx->work.func = copy_nocow_pages_worker;
+ btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
INIT_LIST_HEAD(&nocow_ctx->inodes);
- btrfs_queue_worker(&fs_info->scrub_nocow_workers,
- &nocow_ctx->work);
+ btrfs_queue_work(fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
return 0;
}
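
scrub_workers_get()/scrub_workers_put() now manage plain btrfs_workqueue pointers: btrfs_alloc_workqueue() takes a name, WQ_* flags, a max_active limit and an idle threshold (the last replaces the old idle_thresh field), returns NULL on failure, and the counterpart is btrfs_destroy_workqueue(). A minimal sketch of that lifecycle, with the queue name, threshold and the example_wq variable chosen only for illustration:

/* Sketch of the workqueue lifecycle used above; "btrfs-example" and
 * example_wq are illustrative, the call signatures are as in the hunks. */
static struct btrfs_workqueue *example_wq;

static int example_workers_get(struct btrfs_fs_info *fs_info)
{
	int flags = WQ_FREEZABLE | WQ_UNBOUND;

	example_wq = btrfs_alloc_workqueue("btrfs-example", flags,
					   fs_info->thread_pool_size, 2);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}

static void example_workers_put(void)
{
	btrfs_destroy_workqueue(example_wq);
	example_wq = NULL;
}
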
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..9b6da9d55f9a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
struct {
char *start;
char *end;
- char *prepared;
char *buf;
- int buf_len;
- unsigned int reversed:1;
- unsigned int virtual_mem:1;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
char inline_buf[];
};
- char pad[PAGE_SIZE];
+ /*
+	 * Average path length does not exceed 200 bytes, so we'll get
+	 * better packing in the slab and a higher chance of satisfying
+	 * an allocation later during send.
+ */
+ char pad[256];
};
};
#define FS_PATH_INLINE_SIZE \
@@ -109,6 +112,7 @@ struct send_ctx {
int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
+ u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
struct list_head name_cache_list;
int name_cache_size;
+ struct file_ra_state ra;
+
char *read_buf;
/*
@@ -175,6 +181,47 @@ struct send_ctx {
* own move/rename can be performed.
*/
struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
};
struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
struct waiting_dir_move {
struct rb_node node;
u64 ino;
+ /*
+	 * There might be a directory that could not be removed because it
+	 * was waiting for this directory inode to be moved first. Therefore,
+	 * after this directory is moved, we can try to rmdir the inode
+	 * rmdir_ino.
+ */
+ u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
};
struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
static int need_send_hole(struct send_ctx *sctx)
{
return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
if (!p)
return NULL;
p->reversed = 0;
- p->virtual_mem = 0;
p->buf = p->inline_buf;
p->buf_len = FS_PATH_INLINE_SIZE;
fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
- if (p->buf != p->inline_buf) {
- if (p->virtual_mem)
- vfree(p->buf);
- else
- kfree(p->buf);
- }
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
kfree(p);
}
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
path_len = p->end - p->start;
old_buf_len = p->buf_len;
- len = PAGE_ALIGN(len);
-
- if (p->buf == p->inline_buf) {
- tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- p->virtual_mem = 1;
- }
- memcpy(tmp_buf, p->buf, p->buf_len);
- p->buf = tmp_buf;
- p->buf_len = len;
- } else {
- if (p->virtual_mem) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- vfree(p->buf);
- } else {
- tmp_buf = krealloc(p->buf, len, GFP_NOFS);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- kfree(p->buf);
- p->virtual_mem = 1;
- }
- }
- p->buf = tmp_buf;
- p->buf_len = len;
- }
+
+ /*
+ * First time the inline_buf does not suffice
+ */
+ if (p->buf == p->inline_buf)
+ tmp_buf = kmalloc(len, GFP_NOFS);
+ else
+ tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+	 * The real size of the buffer is bigger; this will let the fast path
+	 * happen most of the time.
+ */
+ p->buf_len = ksize(p->buf);
+
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
p->end = p->buf + p->buf_len - 1;
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
return 0;
}
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
{
int ret;
int new_len;
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
if (p->start != p->end)
*--p->start = '/';
p->start -= name_len;
- p->prepared = p->start;
+ *prepared = p->start;
} else {
if (p->start != p->end)
*p->end++ = '/';
- p->prepared = p->end;
+ *prepared = p->end;
p->end += name_len;
*p->end = 0;
}
@@ -370,12 +413,12 @@ out:
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, name_len);
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, name, name_len);
- p->prepared = NULL;
+ memcpy(prepared, name, name_len);
out:
return ret;
@@ -384,12 +427,12 @@ out:
static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, p2->start, p2->end - p2->start);
- p->prepared = NULL;
+ memcpy(prepared, p2->start, p2->end - p2->start);
out:
return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
unsigned long off, int len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, len);
+ ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
goto out;
- read_extent_buffer(eb, p->prepared, off, len);
- p->prepared = NULL;
+ read_extent_buffer(eb, prepared, off, len);
out:
return ret;
@@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
- char *buf2 = NULL;
- int buf_len;
- int buf_virtual = 0;
+ const int buf_len = PATH_MAX;
u32 name_len;
u32 data_len;
u32 cur;
@@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- buf_len = PAGE_SIZE;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ /*
+ * Path too long
+ */
if (name_len + data_len > buf_len) {
- buf_len = PAGE_ALIGN(name_len + data_len);
- if (buf_virtual) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- vfree(buf);
- } else {
- buf2 = krealloc(buf, buf_len, GFP_NOFS);
- if (!buf2) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- kfree(buf);
- buf_virtual = 1;
- }
- }
-
- buf = buf2;
- buf2 = NULL;
+ ret = -ENAMETOOLONG;
+ goto out;
}
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- if (buf_virtual)
- vfree(buf);
- else
- kfree(buf);
+ kfree(buf);
return ret;
}
@@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = logical - found_key.objectid;
else
extent_item_pos = 0;
-
- extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(sctx->send_root->fs_info,
found_key.objectid, extent_item_pos, 1,
__iterate_backrefs, backref_ctx);
@@ -1418,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx,
while (1) {
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
- if (len >= sizeof(tmp)) {
- /* should really not happen */
- ret = -EOVERFLOW;
- goto out;
- }
+ ASSERT(len < sizeof(tmp));
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1898,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx,
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
- BUG_ON(!nce_head);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
list_del(&nce->radix_list);
list_del(&nce->list);
sctx->name_cache_size--;
- if (list_empty(nce_head)) {
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
@@ -1977,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx)
*/
static int __get_cur_name_and_parent(struct send_ctx *sctx,
u64 ino, u64 gen,
- int skip_name_cache,
u64 *parent_ino,
u64 *parent_gen,
struct fs_path *dest)
@@ -1987,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
struct btrfs_path *path = NULL;
struct name_cache_entry *nce = NULL;
- if (skip_name_cache)
- goto get_ref;
/*
* First check if we already did a call to this function with the same
* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
goto out_cache;
}
-get_ref:
/*
* Depending on whether the inode was already processed or not, use
* send_root or parent_root for ref lookup.
*/
- if (ino < sctx->send_progress && !skip_name_cache)
+ if (ino < sctx->send_progress)
ret = get_first_ref(sctx->send_root, ino,
parent_ino, parent_gen, dest);
else
@@ -2062,8 +2078,6 @@ get_ref:
goto out;
ret = 1;
}
- if (skip_name_cache)
- goto out;
out_cache:
/*
@@ -2131,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
- u64 start_ino = ino;
- u64 start_gen = gen;
- int skip_name_cache = 0;
name = fs_path_alloc();
if (!name) {
@@ -2141,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
}
- if (is_waiting_for_move(sctx, ino))
- skip_name_cache = 1;
-
-again:
dest->reversed = 1;
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
fs_path_reset(name);
- ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache,
- &parent_inode, &parent_gen, name);
+ if (is_waiting_for_rm(sctx, ino)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
if (ret < 0)
goto out;
- if (ret)
- stop = 1;
-
- if (!skip_name_cache &&
- is_waiting_for_move(sctx, parent_inode)) {
- ino = start_ino;
- gen = start_gen;
- stop = 0;
- skip_name_cache = 1;
- goto again;
- }
ret = fs_path_add_path(dest, name);
if (ret < 0)
@@ -2429,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
- NULL, &rdev);
- if (ret < 0)
- goto out;
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+ NULL, NULL, &rdev);
+ if (ret < 0)
+ goto out;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
if (S_ISREG(mode)) {
cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
while (1) {
- ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
- 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->send_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
@@ -2537,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
goto out;
}
- key.offset = found_key.offset + 1;
- btrfs_release_path(path);
+ path->slots[0]++;
}
out:
@@ -2590,7 +2617,7 @@ struct recorded_ref {
* everything mixed. So we first record all refs and later process them.
* This function is a helper to record one ref.
*/
-static int record_ref(struct list_head *head, u64 dir,
+static int __record_ref(struct list_head *head, u64 dir,
u64 dir_gen, struct fs_path *path)
{
struct recorded_ref *ref;
@@ -2676,12 +2703,78 @@ out:
return ret;
}
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (dir_ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(odi);
+ return entry;
+ }
+ }
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
/*
* Returns 1 if a directory can be removed at this point in time.
* We check this by iterating all dir items and checking if the inode behind
* the dir item was already processed.
*/
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
{
int ret = 0;
struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ struct waiting_dir_move *dm;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
break;
- }
di = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ struct orphan_dir_info *odi;
+
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ dm->rmdir_ino = dir;
+ ret = 0;
+ goto out;
+ }
+
if (loc.objectid > send_progress) {
ret = 0;
goto out;
}
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
ret = 1;
@@ -2740,19 +2854,9 @@ out:
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
- struct rb_node *n = sctx->waiting_dir_moves.rb_node;
- struct waiting_dir_move *entry;
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
- while (n) {
- entry = rb_entry(n, struct waiting_dir_move, node);
- if (ino < entry->ino)
- n = n->rb_left;
- else if (ino > entry->ino)
- n = n->rb_right;
- else
- return 1;
- }
- return 0;
+ return entry != NULL;
}
static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
if (!dm)
return -ENOMEM;
dm->ino = ino;
+ dm->rmdir_ino = 0;
while (*p) {
parent = *p;
@@ -2784,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return 0;
}
-static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
{
struct rb_node *n = sctx->waiting_dir_moves.rb_node;
struct waiting_dir_move *entry;
while (n) {
entry = rb_entry(n, struct waiting_dir_move, node);
- if (ino < entry->ino) {
+ if (ino < entry->ino)
n = n->rb_left;
- } else if (ino > entry->ino) {
+ else if (ino > entry->ino)
n = n->rb_right;
- } else {
- rb_erase(&entry->node, &sctx->waiting_dir_moves);
- kfree(entry);
- return 0;
- }
+ else
+ return entry;
}
- return -ENOENT;
+ return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
}
-static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino)
{
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
struct rb_node *parent = NULL;
- struct pending_dir_move *entry, *pm;
+ struct pending_dir_move *entry = NULL, *pm;
struct recorded_ref *cur;
int exists = 0;
int ret;
@@ -2817,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
if (!pm)
return -ENOMEM;
pm->parent_ino = parent_ino;
- pm->ino = sctx->cur_ino;
- pm->gen = sctx->cur_inode_gen;
+ pm->ino = ino;
+ pm->gen = ino_gen;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
u64 orig_progress = sctx->send_progress;
struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
int ret;
+ name = fs_path_alloc();
from_path = fs_path_alloc();
- if (!from_path)
- return -ENOMEM;
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- sctx->send_progress = pm->ino;
- ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ free_waiting_dir_move(sctx, dm);
+
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
if (ret < 0)
goto out;
+ if (parent_ino == sctx->cur_ino) {
+ /* child only renamed, not moved */
+ ASSERT(parent_gen == sctx->cur_inode_gen);
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* child moved and maybe renamed too */
+ sctx->send_progress = pm->ino;
+ ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+ if (ret < 0)
+ goto out;
+ }
+
+ fs_path_free(name);
+ name = NULL;
+
to_path = fs_path_alloc();
if (!to_path) {
ret = -ENOMEM;
@@ -2908,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
}
sctx->send_progress = sctx->cur_ino + 1;
- ret = del_waiting_dir_move(sctx, pm->ino);
- ASSERT(ret == 0);
-
ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
if (ret < 0)
goto out;
@@ -2919,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
if (ret < 0)
goto out;
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ free_orphan_dir_info(sctx, odi);
+ }
+
+finish:
ret = send_utimes(sctx, pm->ino, pm->gen);
if (ret < 0)
goto out;
@@ -2928,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
* and old parent(s).
*/
list_for_each_entry(cur, &pm->update_refs, list) {
+ if (cur->dir == rmdir_ino)
+ continue;
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
}
out:
+ fs_path_free(name);
fs_path_free(from_path);
fs_path_free(to_path);
sctx->send_progress = orig_progress;
@@ -3005,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
int ret;
u64 ino = parent_ref->dir;
u64 parent_ino_before, parent_ino_after;
- u64 new_gen, old_gen;
+ u64 old_gen;
struct fs_path *path_before = NULL;
struct fs_path *path_after = NULL;
int len1, len2;
-
- if (parent_ref->dir <= sctx->cur_ino)
- return 0;
+ int register_upper_dirs;
+ u64 gen;
if (is_waiting_for_move(sctx, ino))
return 1;
+ if (parent_ref->dir <= sctx->cur_ino)
+ return 0;
+
ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
NULL, NULL, NULL, NULL);
if (ret == -ENOENT)
@@ -3023,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
else if (ret < 0)
return ret;
- ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
- NULL, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
-
- if (new_gen != old_gen)
+ if (parent_ref->dir_gen != old_gen)
return 0;
path_before = fs_path_alloc();
@@ -3051,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
}
ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
- NULL, path_after);
+ &gen, path_after);
if (ret == -ENOENT) {
ret = 0;
goto out;
@@ -3061,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
len1 = fs_path_len(path_before);
len2 = fs_path_len(path_after);
- if ((parent_ino_before != parent_ino_after) && (len1 != len2 ||
- memcmp(path_before->start, path_after->start, len1))) {
+ if (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1)) {
ret = 1;
goto out;
}
ret = 0;
+ /*
+ * Ok, our new most direct ancestor has a higher inode number but
+ * wasn't moved/renamed. So maybe some of the new ancestors higher in
+	 * the hierarchy have a higher inode number too *and* were renamed
+ * or moved - in this case we need to wait for the ancestor's rename
+ * or move operation before we can do the move/rename for the current
+ * inode.
+ */
+ register_upper_dirs = 0;
+ ino = parent_ino_after;
+again:
+ while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
+ u64 parent_gen;
+
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ &parent_gen, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret == -ENOENT) {
+ ret = 0;
+ break;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1)) {
+ ret = 1;
+ if (register_upper_dirs) {
+ break;
+ } else {
+ register_upper_dirs = 1;
+ ino = parent_ref->dir;
+ gen = parent_ref->dir_gen;
+ goto again;
+ }
+ } else if (register_upper_dirs) {
+ ret = add_pending_dir_move(sctx, ino, gen,
+ parent_ino_after);
+ if (ret < 0 && ret != -EEXIST)
+ goto out;
+ }
+
+ ino = parent_ino_after;
+ gen = parent_gen;
+ }
+
out:
fs_path_free(path_before);
fs_path_free(path_after);
@@ -3089,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
u64 ow_gen;
int did_overwrite = 0;
int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
@@ -3227,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- if (wait_for_parent_move(sctx, cur)) {
+ ret = wait_for_parent_move(sctx, cur);
+ if (ret < 0)
+ goto out;
+ if (ret) {
ret = add_pending_dir_move(sctx,
- cur->dir);
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ cur->dir);
*pending_move = 1;
} else {
ret = send_rename(sctx, valid_path,
@@ -3259,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* later, we do this check again and rmdir it then if possible.
* See the use of check_dirs for more details.
*/
- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3350,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete) {
- ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3362,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
+ last_dir_ino_rm = cur->dir;
}
}
}
@@ -3375,9 +3613,8 @@ out:
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+ struct fs_path *name, void *ctx, struct list_head *refs)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -3388,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
NULL, NULL);
if (ret < 0)
goto out;
@@ -3400,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (ret < 0)
goto out;
- ret = record_ref(&sctx->new_refs, dir, gen, p);
+ ret = __record_ref(refs, dir, gen, p);
out:
if (ret)
@@ -3408,37 +3645,23 @@ out:
return ret;
}
+static int __record_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->send_root, num, dir, index, name,
+ ctx, &sctx->new_refs);
+}
+
+
static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *name,
void *ctx)
{
- int ret = 0;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, dir, gen, p);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(p, name);
- if (ret < 0)
- goto out;
-
- ret = record_ref(&sctx->deleted_refs, dir, gen, p);
-
-out:
- if (ret)
- fs_path_free(p);
- return ret;
+ return record_ref(sctx->parent_root, num, dir, index, name,
+ ctx, &sctx->deleted_refs);
}
static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx,
root = sctx->parent_root;
cb = __record_deleted_ref;
} else {
- BUG();
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
}
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret)
- break;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
@@ -3642,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx,
break;
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
- btrfs_release_path(path);
if (ret < 0)
goto out;
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
btrfs_release_path(path);
@@ -3927,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
@@ -3951,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (ret < 0)
goto out;
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
out:
@@ -3991,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
goto out;
last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* initial readahead */
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, inode->i_mapping);
+ btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+ last_index - index + 1);
+
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_CACHE_SIZE - pg_offset);
@@ -4763,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
ret = apply_children_dir_moves(sctx);
if (ret)
goto out;
+ /*
+		 * We need to send the utimes every time, whether or not they
+		 * actually changed between the two trees, as we have made
+		 * changes to the inode before this point. If our inode is a
+		 * directory and it's waiting to be moved/renamed, we will
+		 * send its utimes when it's moved/renamed, so we don't need
+		 * to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
}
- /*
- * Need to send that every time, no matter if it actually
- * changed between the two trees as we have done changes to
- * the inode before.
- */
- sctx->send_progress = sctx->cur_ino + 1;
- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
- if (ret < 0)
- goto out;
-
out:
return ret;
}
@@ -4840,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
ret = send_create_inode_if_needed(sctx);
if (ret < 0)
goto out;
@@ -5118,6 +5367,7 @@ out:
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5139,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
+join_trans:
+ /*
+ * We need to make sure the transaction does not get committed
+ * while we do anything on commit roots. Join a transaction to prevent
+ * this.
+ */
+ trans = btrfs_join_transaction(send_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
/*
* Make sure the tree has not changed after re-joining. We detect this
* by comparing start_ctransid and ctransid. They should always match.
@@ -5162,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx)
goto out_finish;
while (1) {
+ /*
+		 * When someone wants to commit while we iterate, end the
+ * joined transaction and rejoin.
+ */
+ if (btrfs_should_end_transaction(trans, send_root)) {
+ ret = btrfs_end_transaction(trans, send_root);
+ trans = NULL;
+ if (ret < 0)
+ goto out;
+ btrfs_release_path(path);
+ goto join_trans;
+ }
+
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -5189,6 +5465,12 @@ out_finish:
out:
btrfs_free_path(path);
+ if (trans) {
+ if (!ret)
+ ret = btrfs_end_transaction(trans, send_root);
+ else
+ btrfs_end_transaction(trans, send_root);
+ }
return ret;
}
@@ -5340,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
(arg->clone_sources_count + 1));
@@ -5477,6 +5760,16 @@ out:
kfree(dm);
}
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
if (sort_clone_roots) {
for (i = 0; i < sctx->clone_roots_cnt; i++)
btrfs_root_dec_send_in_progress(
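
One detail of the fs_path rework above is easy to miss: after the first spill out of the 256-byte inline buffer the path buffer is only ever kmalloc'd/krealloc'd (the vmalloc fallback is gone), and buf_len is refreshed from ksize() so that subsequent small growth usually finds the buffer already big enough. A self-contained sketch of that growth strategy follows; the struct and function names here are illustrative, not the ones in send.c.

/* Illustrative sketch of the inline-buffer growth strategy; small_buf and
 * small_buf_ensure are made-up names, the allocator calls match the hunks. */
struct small_buf {
	char *buf;		/* points at inline_buf until the first spill */
	unsigned int buf_len;	/* current capacity, 256 initially */
	char inline_buf[256];
};

static int small_buf_ensure(struct small_buf *p, unsigned int len)
{
	char *tmp;

	if (len <= p->buf_len)
		return 0;				/* fast path */

	if (p->buf == p->inline_buf) {
		/* first time the inline buffer does not suffice */
		tmp = kmalloc(len, GFP_NOFS);
		if (!tmp)
			return -ENOMEM;
		memcpy(tmp, p->inline_buf, p->buf_len);
	} else {
		tmp = krealloc(p->buf, len, GFP_NOFS);
		if (!tmp)
			return -ENOMEM;
	}
	p->buf = tmp;
	p->buf_len = ksize(tmp);	/* remember the real slab object size */
	return 0;
}
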
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..d4878ddba87a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1305,13 +1305,6 @@ error_fs_info:
return ERR_PTR(error);
}
-static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
-{
- spin_lock_irq(&workers->lock);
- workers->max_workers = new_limit;
- spin_unlock_irq(&workers->lock);
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
int new_pool_size, int old_pool_size)
{
@@ -1323,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "resize thread pool %d -> %d",
old_pool_size, new_pool_size);
- btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
- new_pool_size);
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+ new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+ new_pool_size);
}
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1479,6 +1471,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
}
out:
+ wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
#include <linux/kobject.h>
#include <linux/bug.h>
#include <linux/genhd.h>
+#include <linux/debugfs.h>
#include "ctree.h"
#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
@@ -642,27 +649,41 @@ failure:
return error;
}
+static int btrfs_init_debugfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+ if (!btrfs_debugfs_root_dentry)
+ return -ENOMEM;
+
+ debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+ &btrfs_debugfs_test);
+#endif
+ return 0;
+}
+
int btrfs_init_sysfs(void)
{
int ret;
+
btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
if (!btrfs_kset)
return -ENOMEM;
- init_feature_attrs();
+ ret = btrfs_init_debugfs();
+ if (ret)
+ return ret;
+ init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
- if (ret) {
- kset_unregister(btrfs_kset);
- return ret;
- }
- return 0;
+ return ret;
}
void btrfs_exit_sysfs(void)
{
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
}
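
The new hooks follow the stock debugfs pattern: create a directory under /sys/kernel/debug at init time, hang simple integer files off it, and tear the whole tree down with debugfs_remove_recursive() on exit. A minimal, self-contained sketch of that same pattern outside btrfs (the demo_* module, file names and mode are illustrative, not btrfs code):

        #include <linux/module.h>
        #include <linux/debugfs.h>

        /* illustrative stand-ins for btrfs_debugfs_root_dentry / btrfs_debugfs_test */
        static struct dentry *demo_debugfs_root;
        static u64 demo_debugfs_test;

        static int __init demo_debugfs_init(void)
        {
                /* creates /sys/kernel/debug/demo */
                demo_debugfs_root = debugfs_create_dir("demo", NULL);
                if (!demo_debugfs_root)
                        return -ENOMEM;

                /* /sys/kernel/debug/demo/test, a read/write u64 */
                debugfs_create_u64("test", 0644, demo_debugfs_root,
                                   &demo_debugfs_test);
                return 0;
        }

        static void __exit demo_debugfs_exit(void)
        {
                /* removes the directory and everything created under it */
                debugfs_remove_recursive(demo_debugfs_root);
        }

        module_init(demo_debugfs_init);
        module_exit(demo_debugfs_exit);
        MODULE_LICENSE("GPL");
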
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
#ifndef _BTRFS_SYSFS_H_
#define _BTRFS_SYSFS_H_
+/*
+ * Data exported through debugfs
+ */
+extern u64 btrfs_debugfs_test;
+
enum btrfs_feature_set {
FEAT_COMPAT,
FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..a04707f740d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
- if (--trans->use_count) {
+ if (trans->use_count > 1) {
+ trans->use_count--;
trans->block_rsv = trans->orig_rsv;
return 0;
}
@@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle) {
- /*
- * We may race with somebody else here so end up having
- * to call end_transaction on ourselves again, so inc
- * our use_count.
- */
- trans->use_count++;
+ if (throttle)
return btrfs_commit_transaction(trans, root);
- } else {
+ else
wake_up_process(info->transaction_kthread);
- }
}
if (trans->type & __TRANS_FREEZABLE)
@@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
trace_btrfs_transaction_commit(root);
- btrfs_scrub_continue(root);
-
if (current->journal_info == trans)
current->journal_info = NULL;
+ btrfs_scrub_cancel(root->fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
@@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- return btrfs_start_delalloc_roots(fs_info, 1);
+ return btrfs_start_delalloc_roots(fs_info, 1, -1);
return 0;
}
@@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
/* ->aborted might be set after the previous check, so check it */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
* the reloc mutex makes sure that we stop
@@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = create_pending_snapshots(trans, root->fs_info);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_items(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = cur_trans->aborted;
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
btrfs_prepare_extent_commit(trans, root);
@@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = write_ctree_super(trans, root, 0);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
+scrub_continue:
+ btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
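
The new scrub_continue label exists because the commit path pauses scrub early on: any failure after that point has to resume scrub before falling through to the existing cleanup_transaction unwinding, while cleanup_transaction() itself now cancels scrub outright. This layered-goto shape is the usual kernel error-unwinding idiom; a generic, simplified sketch with hypothetical helpers (the pause/resume/cleanup names below are stand-ins, not btrfs functions):

        int prepare_state(void);                /* hypothetical helpers */
        int pause_background_work(void);        /* think: btrfs_scrub_pause() */
        int write_everything_out(void);
        void resume_background_work(void);      /* think: btrfs_scrub_continue() */
        void cleanup_partial_state(void);

        int commit_like_sequence(void)
        {
                int ret;

                ret = prepare_state();
                if (ret)
                        goto cleanup;           /* nothing paused yet */

                ret = pause_background_work();
                if (ret)
                        goto cleanup;

                ret = write_everything_out();
                if (ret)
                        goto resume_background; /* undo the pause before cleanup */

                resume_background_work();       /* the success path resumes it too */
                return 0;

        resume_background:
                resume_background_work();
        cleanup:
                cleanup_partial_state();
                return ret;
        }
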
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
+ int index;
int ret;
- int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+ trans->transid) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
if (!root->log_start_pid) {
root->log_start_pid = current->pid;
root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
mutex_unlock(&root->log_mutex);
return 0;
}
- root->log_multiple_pids = false;
- root->log_start_pid = current->pid;
+
+ ret = 0;
mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree) {
+ if (!root->fs_info->log_root_tree)
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- if (ret)
- err = ret;
- }
- if (err == 0 && !root->log_root) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
+
+ if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
- err = ret;
+ goto out;
}
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ root->log_multiple_pids = false;
+ root->log_start_pid = current->pid;
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+out:
mutex_unlock(&root->log_mutex);
- return err;
+ return ret;
}
/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ if (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]))
schedule();
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ } while (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]));
- return 0;
}
static void wait_for_writer(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers)) {
+
+ while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
+ if (atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
}
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked with the log mutex held, or from a context where no other
+ * task can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
/*
 * btrfs_sync_log sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- unsigned long log_transid = 0;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
mutex_lock(&root->log_mutex);
- log_transid = root->log_transid;
- index1 = root->log_transid % 2;
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, root->log_transid);
+ wait_log_commit(trans, root, log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
+ ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, root->log_transid - 1);
+ wait_log_commit(trans, root, log_transid - 1);
+
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+ trans->transid) {
ret = -EAGAIN;
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
- smp_mb();
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ btrfs_init_log_ctx(&root_log_ctx);
+
mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch);
atomic_inc(&log_root_tree->log_writers);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
blk_finish_plug(&plug);
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
- root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = log_root_tree->log_transid % 2;
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
+ root_log_ctx.log_transid);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ ret = root_log_ctx.log_ret;
goto out;
}
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
+ root_log_ctx.log_transid - 1);
}
wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+ trans->transid) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_transid++;
- smp_mb();
-
mutex_unlock(&log_root_tree->log_mutex);
/*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
if (ret) {
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
btrfs_abort_transaction(trans, root, ret);
goto out_wake_log_root;
}
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
+ /*
+ * We don't need to take log_mutex here because we know all the
+ * other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
- smp_mb();
+ mutex_unlock(&log_root_tree->log_mutex);
+
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ mutex_lock(&root->log_mutex);
+ root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
- smp_mb();
+ mutex_unlock(&root->log_mutex);
+
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path)
+ struct extent_map *em, struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
int extent_inserted = 0;
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
* First check and see if our csums are on our outstanding ordered
* extents.
*/
-again:
- spin_lock_irq(&log->log_extents_lock[index]);
- list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
if (!mod_len)
break;
- if (ordered->inode != inode)
- continue;
-
if (ordered->file_offset + ordered->len <= mod_start ||
mod_start + mod_len <= ordered->file_offset)
continue;
@@ -3632,12 +3720,6 @@ again:
if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
&ordered->flags))
continue;
- atomic_inc(&ordered->refs);
- spin_unlock_irq(&log->log_extents_lock[index]);
- /*
- * we've dropped the lock, we must either break or
- * start over after this.
- */
if (ordered->csum_bytes_left) {
btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret) {
- btrfs_put_ordered_extent(ordered);
+ if (ret)
goto unlocked;
- }
}
- btrfs_put_ordered_extent(ordered);
- goto again;
}
- spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
u64 last_extent = 0;
int err = 0;
int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(log, inode);
+ btrfs_get_logged_extents(inode, &logged_list);
/*
* a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list);
if (ret) {
err = ret;
goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
- if (err)
- btrfs_free_logged_extents(log, log->log_transid);
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+ struct dentry *parent, int exists_only,
+ struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_trans;
+ goto end_no_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 1;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ 0, ctx);
dput(parent);
return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
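
Taken together, the header shows the intended calling convention: a log caller puts a btrfs_log_ctx on its stack, initializes it, hands it to btrfs_log_dentry_safe() so it gets linked into the root's log_ctxs list, and later passes the same context to btrfs_sync_log(), which can return the stored result if another task already committed that log transid. A sketch of that shape, assuming the caller (the fsync path elsewhere in this series) already has a transaction handle and the dentry; BTRFS_NO_LOG_SYNC and full-commit fallbacks are omitted:

        /* sketch only: an illustrative wrapper, not a btrfs function */
        static int sync_one_dentry_log(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct dentry *dentry)
        {
                struct btrfs_log_ctx ctx;
                int ret;

                btrfs_init_log_ctx(&ctx);

                ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
                if (ret)        /* a real caller also checks BTRFS_NO_LOG_SYNC here */
                        return ret;

                /*
                 * ctx.log_transid remembers which log commit we joined; if
                 * another task commits it first, btrfs_sync_log() hands back
                 * ctx.log_ret instead of repeating the work.
                 */
                return btrfs_sync_log(trans, root, &ctx);
        }
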
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..d241130a32fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
device->running_pending = 1;
spin_unlock(&device->io_lock);
- btrfs_requeue_work(&device->work);
+ btrfs_queue_work(fs_info->submit_workers,
+ &device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
@@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
@@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (bio == bbio->orig_bio)
is_orig_bio = 1;
+ btrfs_bio_counter_dec(bbio->fs_info);
+
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
@@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
}
}
-struct async_sched {
- struct bio *bio;
- int rw;
- struct btrfs_fs_info *info;
- struct btrfs_work work;
-};
-
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
spin_unlock(&device->io_lock);
if (should_queue)
- btrfs_queue_worker(&root->fs_info->submit_workers,
- &device->work);
+ btrfs_queue_work(root->fs_info->submit_workers,
+ &device->work);
}
static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
}
#endif
bio->bi_bdev = dev->bdev;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+
if (async)
btrfs_schedule_bio(root, dev, rw, bio);
else
@@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
length = bio->bi_iter.bi_size;
map_length = length;
+ btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num, &raid_map);
- if (ret) /* -ENOMEM */
+ if (ret) {
+ btrfs_bio_counter_dec(root->fs_info);
return ret;
+ }
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
+ bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if (raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- return raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
} else {
- return raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
- mirror_num);
+ ret = raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
}
+ /*
+ * FIXME: device replace doesn't support raid56 yet, please fix
+ * it in the future.
+ */
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
}
if (map_length < length) {
@@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
async_submit);
dev_nr++;
}
+ btrfs_bio_counter_dec(root->fs_info);
return 0;
}
@@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- dev->work.func = pending_bios_fn;
+ btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
return dev;
}
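
The btrfs_bio_counter_* calls added here bracket every bio that goes through btrfs_map_bio(): the count is raised before __btrfs_map_block() (blocking if a device replace wants the counter drained, hence the _blocked variant), raised again per cloned stripe bio with the _noblocked variant, and dropped on every exit path, including the per-stripe completion in btrfs_end_bio(). The invariant is the usual elevated-count pattern; a simplified sketch with hypothetical stand-ins (the demo_* and io_counter_* names are not btrfs code):

        struct demo_fs;                         /* stand-in for btrfs_fs_info */
        struct demo_bio;

        void io_counter_inc_blocked(struct demo_fs *fs);   /* may wait on dev-replace */
        void io_counter_inc_noblocked(struct demo_fs *fs); /* per extra stripe bio */
        void io_counter_dec(struct demo_fs *fs);
        int map_and_submit_stripes(struct demo_fs *fs, struct demo_bio *bio);

        int submit_mapped_io(struct demo_fs *fs, struct demo_bio *bio)
        {
                int ret;

                io_counter_inc_blocked(fs);     /* hold off dev-replace while mapped */

                ret = map_and_submit_stripes(fs, bio); /* each stripe takes _noblocked */
                if (ret) {
                        io_counter_dec(fs);     /* error before submission: drop ours */
                        return ret;
                }

                io_counter_dec(fs);     /* submitted: each stripe end_io drops its own */
                return 0;
        }
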
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
atomic_t stripes_pending;
+ struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
void *private;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 3176cdc32937..4ee4e30d26d9 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -21,6 +21,8 @@ struct btrfs_block_group_cache;
struct btrfs_free_cluster;
struct map_lookup;
struct extent_buffer;
+struct btrfs_work;
+struct __btrfs_workqueue;
#define show_ref_type(type) \
__print_symbolic(type, \
@@ -982,6 +984,141 @@ TRACE_EVENT(free_extent_state,
(void *)__entry->ip)
);
+DECLARE_EVENT_CLASS(btrfs__work,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work),
+
+ TP_STRUCT__entry(
+ __field( void *, work )
+ __field( void *, wq )
+ __field( void *, func )
+ __field( void *, ordered_func )
+ __field( void *, ordered_free )
+ ),
+
+ TP_fast_assign(
+ __entry->work = work;
+ __entry->wq = work->wq;
+ __entry->func = work->func;
+ __entry->ordered_func = work->ordered_func;
+ __entry->ordered_free = work->ordered_free;
+ ),
+
+ TP_printk("work=%p, wq=%p, func=%p, ordered_func=%p, ordered_free=%p",
+ __entry->work, __entry->wq, __entry->func,
+ __entry->ordered_func, __entry->ordered_free)
+);
+
+/* For situations where the work is freed */
+DECLARE_EVENT_CLASS(btrfs__work__done,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work),
+
+ TP_STRUCT__entry(
+ __field( void *, work )
+ ),
+
+ TP_fast_assign(
+ __entry->work = work;
+ ),
+
+ TP_printk("work->%p", __entry->work)
+);
+
+DEFINE_EVENT(btrfs__work, btrfs_work_queued,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work)
+);
+
+DEFINE_EVENT(btrfs__work, btrfs_work_sched,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work)
+);
+
+DEFINE_EVENT(btrfs__work, btrfs_normal_work_done,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work)
+);
+
+DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work)
+);
+
+DEFINE_EVENT(btrfs__work, btrfs_ordered_sched,
+
+ TP_PROTO(struct btrfs_work *work),
+
+ TP_ARGS(work)
+);
+
+DECLARE_EVENT_CLASS(btrfs__workqueue,
+
+ TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high),
+
+ TP_ARGS(wq, name, high),
+
+ TP_STRUCT__entry(
+ __field( void *, wq )
+ __string( name, name )
+ __field( int , high )
+ ),
+
+ TP_fast_assign(
+ __entry->wq = wq;
+ __assign_str(name, name);
+ __entry->high = high;
+ ),
+
+ TP_printk("name=%s%s, wq=%p", __get_str(name),
+ __print_flags(__entry->high, "",
+ {(WQ_HIGHPRI), "-high"}),
+ __entry->wq)
+);
+
+DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc,
+
+ TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high),
+
+ TP_ARGS(wq, name, high)
+);
+
+DECLARE_EVENT_CLASS(btrfs__workqueue_done,
+
+ TP_PROTO(struct __btrfs_workqueue *wq),
+
+ TP_ARGS(wq),
+
+ TP_STRUCT__entry(
+ __field( void *, wq )
+ ),
+
+ TP_fast_assign(
+ __entry->wq = wq;
+ ),
+
+ TP_printk("wq=%p", __entry->wq)
+);
+
+DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
+
+ TP_PROTO(struct __btrfs_workqueue *wq),
+
+ TP_ARGS(wq)
+);
+
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */
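
As usual for DECLARE_EVENT_CLASS/DEFINE_EVENT, each DEFINE_EVENT above expands into a trace_<event_name>() helper; the workqueue code only has to call it with the btrfs_work or __btrfs_workqueue pointer, TP_fast_assign copies the fields at the call site, and TP_printk formats them when the trace buffer is read. A hypothetical sketch of where such calls would sit, assuming the queueing, scheduling and teardown paths have the pointers at hand (the actual call sites live in async-thread.c, outside this hunk):

        /* illustrative placement only, not the literal async-thread.c code */

        /* when a btrfs_work is handed to its queue */
        trace_btrfs_work_queued(work);

        /* just before work->func() runs */
        trace_btrfs_work_sched(work);

        /* after ordered_free: only the pointer value is still meaningful */
        trace_btrfs_all_work_done(work);

        /* on workqueue creation and teardown */
        trace_btrfs_workqueue_alloc(wq, name, high);
        trace_btrfs_workqueue_destroy(wq);

Once compiled in, the events appear under /sys/kernel/debug/tracing/events/btrfs/ and can be enabled per event like any other tracepoint.
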