From 8811b5968f6216e97ccb9fe7b9883af39e339921 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 2 Aug 2012 08:33:00 +1000
Subject: raid5: make_request use batch stripe release

make_request() releases every stripe individually, and the stripe usually
has a count of 1, so the earlier release_stripe() optimization does not
help. In my testing, this release_stripe() became the heaviest place where
conf->device_lock is taken once the previous patches were applied.

This patch batches the stripe release: all the stripes are released at
unplug time. The STRIPE_ON_UNPLUG_LIST bit protects against concurrent
access to the stripe lru list.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/raid5.h |  1 +
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bde9da2baa3..978ba9b7a3c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -471,7 +471,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	} else {
 		if (atomic_read(&sh->count)) {
 			BUG_ON(!list_empty(&sh->lru)
-			    && !test_bit(STRIPE_EXPANDING, &sh->state));
+			    && !test_bit(STRIPE_EXPANDING, &sh->state)
+			    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
 		} else {
 			if (!test_bit(STRIPE_HANDLE, &sh->state))
 				atomic_inc(&conf->active_stripes);
@@ -3988,6 +3989,62 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
 	return sh;
 }
 
+struct raid5_plug_cb {
+	struct blk_plug_cb	cb;
+	struct list_head	list;
+};
+
+static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
+{
+	struct raid5_plug_cb *cb = container_of(
+		blk_cb, struct raid5_plug_cb, cb);
+	struct stripe_head *sh;
+	struct mddev *mddev = cb->cb.data;
+	struct r5conf *conf = mddev->private;
+
+	if (cb->list.next && !list_empty(&cb->list)) {
+		spin_lock_irq(&conf->device_lock);
+		while (!list_empty(&cb->list)) {
+			sh = list_first_entry(&cb->list, struct stripe_head, lru);
+			list_del_init(&sh->lru);
+			/*
+			 * avoid the race where release_stripe_plug() sees
+			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
+			 * is still in our list
+			 */
+			smp_mb__before_clear_bit();
+			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
+			__release_stripe(conf, sh);
+		}
+		spin_unlock_irq(&conf->device_lock);
+	}
+	kfree(cb);
+}
+
+static void release_stripe_plug(struct mddev *mddev,
+				struct stripe_head *sh)
+{
+	struct blk_plug_cb *blk_cb = blk_check_plugged(
+		raid5_unplug, mddev,
+		sizeof(struct raid5_plug_cb));
+	struct raid5_plug_cb *cb;
+
+	if (!blk_cb) {
+		release_stripe(sh);
+		return;
+	}
+
+	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
+
+	if (cb->list.next == NULL)
+		INIT_LIST_HEAD(&cb->list);
+
+	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
+		list_add_tail(&sh->lru, &cb->list);
+	else
+		release_stripe(sh);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
@@ -4116,8 +4173,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			if ((bi->bi_rw & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
-			mddev_check_plugged(mddev);
-			release_stripe(sh);
+			release_stripe_plug(mddev, sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2164021f3b5..9a7b36f0a42 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -319,6 +319,7 @@ enum {
 	STRIPE_BIOFILL_RUN,
 	STRIPE_COMPUTE_RUN,
 	STRIPE_OPS_REQ_PENDING,
+	STRIPE_ON_UNPLUG_LIST,
 };
 
 /*
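For context on the block-layer hook the patch above builds on: blk_check_plugged() looks up (or allocates, zeroed) a per-task callback on the current plug, returns NULL when no plug is held, and the callback runs at unplug time. Below is a minimal standalone sketch of that deferral pattern, not taken from the patch; every my_* name is invented for illustration.

/*
 * Illustration only: the blk_check_plugged() pattern that
 * release_stripe_plug()/raid5_unplug() use, with the raid5
 * details stripped out.  All my_* names are hypothetical.
 */
#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/slab.h>

struct my_plug_cb {
	struct blk_plug_cb cb;	/* must stay first: container_of() below */
	struct list_head items;	/* work deferred while the plug is held */
};

static void my_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
{
	struct my_plug_cb *cb = container_of(blk_cb, struct my_plug_cb, cb);

	/* Called at unplug time; flush the whole batch here. */
	while (!list_empty(&cb->items)) {
		struct list_head *entry = cb->items.next;

		list_del_init(entry);
		/* ... hand 'entry' back to the driver, e.g. under its lock ... */
	}
	kfree(cb);		/* blk_check_plugged() allocated it */
}

static void my_defer(void *owner, struct list_head *entry)
{
	struct blk_plug_cb *blk_cb;
	struct my_plug_cb *cb;

	/*
	 * Find (or allocate) the callback registered for (my_unplug, owner)
	 * on the current task's plug; NULL means no plug is active.
	 */
	blk_cb = blk_check_plugged(my_unplug, owner, sizeof(*cb));
	if (!blk_cb) {
		/* no plug in progress: process 'entry' immediately instead */
		return;
	}

	cb = container_of(blk_cb, struct my_plug_cb, cb);
	if (cb->items.next == NULL)	/* freshly allocated and zeroed */
		INIT_LIST_HEAD(&cb->items);
	list_add_tail(entry, &cb->items);
}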
From 46a06401f6ba13e59d24746fa9ffa6773b69eee3 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 2 Aug 2012 08:33:15 +1000
Subject: raid5: raid5d handle stripe in batch way

Let raid5d handle stripes in batches, to reduce the amount of
conf->device_lock locking.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 978ba9b7a3c..9e41ae37bd4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4574,6 +4574,30 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	return handled;
 }
 
+#define MAX_STRIPE_BATCH 8
+static int handle_active_stripes(struct r5conf *conf)
+{
+	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
+	int i, batch_size = 0;
+
+	while (batch_size < MAX_STRIPE_BATCH &&
+			(sh = __get_priority_stripe(conf)) != NULL)
+		batch[batch_size++] = sh;
+
+	if (batch_size == 0)
+		return batch_size;
+	spin_unlock_irq(&conf->device_lock);
+
+	for (i = 0; i < batch_size; i++)
+		handle_stripe(batch[i]);
+
+	cond_resched();
+
+	spin_lock_irq(&conf->device_lock);
+	for (i = 0; i < batch_size; i++)
+		__release_stripe(conf, batch[i]);
+	return batch_size;
+}
 
 /*
  * This is our raid5 kernel thread.
@@ -4584,7 +4608,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
  */
 static void raid5d(struct mddev *mddev)
 {
-	struct stripe_head *sh;
 	struct r5conf *conf = mddev->private;
 	int handled;
 	struct blk_plug plug;
@@ -4598,6 +4621,7 @@ static void raid5d(struct mddev *mddev)
 	spin_lock_irq(&conf->device_lock);
 	while (1) {
 		struct bio *bio;
+		int batch_size;
 
 		if (
 		    !list_empty(&conf->bitmap_list)) {
@@ -4621,21 +4645,16 @@ static void raid5d(struct mddev *mddev)
 			handled++;
 		}
 
-		sh = __get_priority_stripe(conf);
-
-		if (!sh)
+		batch_size = handle_active_stripes(conf);
+		if (!batch_size)
 			break;
-		spin_unlock_irq(&conf->device_lock);
-
-		handled++;
-		handle_stripe(sh);
-		release_stripe(sh);
-		cond_resched();
+		handled += batch_size;
 
-		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+		if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+			spin_unlock_irq(&conf->device_lock);
 			md_check_recovery(mddev);
-
-		spin_lock_irq(&conf->device_lock);
+			spin_lock_irq(&conf->device_lock);
+		}
 	}
 	pr_debug("%d stripes handled\n", handled);
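The locking pattern in the raid5d patch above is general: take the lock once to pull up to MAX_STRIPE_BATCH items off the queue, drop it for the expensive per-item work, then retake it once to release the whole batch. A rough self-contained sketch of that shape follows; the my_* structures and helpers are hypothetical, not md code.

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

#define MY_BATCH 8

struct my_item {
	struct list_head lru;
};

static void my_handle(struct my_item *it)    { /* expensive work, lock dropped */ }
static void __my_release(struct my_item *it) { /* cheap bookkeeping, lock held */ }

/* Called and returns with *lock held, like handle_active_stripes(). */
static int my_handle_batch(spinlock_t *lock, struct list_head *queue)
{
	struct my_item *batch[MY_BATCH];
	int i, n = 0;

	while (n < MY_BATCH && !list_empty(queue)) {
		batch[n] = list_first_entry(queue, struct my_item, lru);
		list_del_init(&batch[n]->lru);
		n++;
	}
	if (!n)
		return 0;

	spin_unlock_irq(lock);
	for (i = 0; i < n; i++)
		my_handle(batch[i]);	/* heavy lifting without the lock */
	cond_resched();
	spin_lock_irq(lock);

	for (i = 0; i < n; i++)
		__my_release(batch[i]);	/* one lock round-trip for the whole batch */
	return n;
}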
From f54a9d0e59c4bea3db733921ca9147612a6f292c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 2 Aug 2012 08:33:20 +1000
Subject: md/raid1: submit IO from originating thread instead of md thread.

Queuing writes to the md thread means that all requests go through the
one processor, which may not be able to keep up with very high request
rates.

So use the plugging infrastructure to submit all requests on unplug.
If a 'schedule' is needed, we fall back on the old approach of handing
the requests to the thread for it to handle.

Signed-off-by: NeilBrown
---
 drivers/md/bitmap.c |  2 +-
 drivers/md/raid1.c  | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 15dbe03117e..94e7f6ba2e1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1305,7 +1305,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			spin_unlock_irq(&bitmap->counts.lock);
-			io_schedule();
+			schedule();
 			finish_wait(&bitmap->overflow_wait, &__wait);
 			continue;
 		}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36a8fc059ac..9f01870d031 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -870,6 +870,44 @@ do_sync_io:
 	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
+struct raid1_plug_cb {
+	struct blk_plug_cb	cb;
+	struct bio_list		pending;
+	int			pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+						  cb);
+	struct mddev *mddev = plug->cb.data;
+	struct r1conf *conf = mddev->private;
+	struct bio *bio;
+
+	if (from_schedule) {
+		spin_lock_irq(&conf->device_lock);
+		bio_list_merge(&conf->pending_bio_list, &plug->pending);
+		conf->pending_count += plug->pending_cnt;
+		spin_unlock_irq(&conf->device_lock);
+		md_wakeup_thread(mddev->thread);
+		kfree(plug);
+		return;
+	}
+
+	/* we aren't scheduling, so we can do the write-out directly. */
+	bio = bio_list_get(&plug->pending);
+	bitmap_unplug(mddev->bitmap);
+	wake_up(&conf->wait_barrier);
+
+	while (bio) { /* submit pending writes */
+		struct bio *next = bio->bi_next;
+		bio->bi_next = NULL;
+		generic_make_request(bio);
+		bio = next;
+	}
+	kfree(plug);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
@@ -883,6 +921,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	struct md_rdev *blocked_rdev;
+	struct blk_plug_cb *cb;
+	struct raid1_plug_cb *plug = NULL;
 	int first_clone;
 	int sectors_handled;
 	int max_sectors;
@@ -1185,11 +1225,22 @@ read_again:
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
+
+		cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+		if (cb)
+			plug = container_of(cb, struct raid1_plug_cb, cb);
+		else
+			plug = NULL;
 		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
+		if (plug) {
+			bio_list_add(&plug->pending, mbio);
+			plug->pending_cnt++;
+		} else {
+			bio_list_add(&conf->pending_bio_list, mbio);
+			conf->pending_count++;
+		}
 		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!mddev_check_plugged(mddev))
+		if (!plug)
 			md_wakeup_thread(mddev->thread);
 	}
 	/* Mustn't call r1_bio_write_done before this next test,
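What makes the raid1 patch above pay off is that unplug callbacks run in the submitting task's context: a writer that brackets its submissions with a plug has raid1_unplug() issue its queued bios on its own CPU when the plug is finished, instead of funnelling everything through the single md thread. A hypothetical caller might look roughly like this (my_submit_writes is invented for illustration):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical caller: not md code, just the plugging idiom it relies on. */
static void my_submit_writes(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		generic_make_request(bios[i]);	/* raid1 make_request() queues on the plug */
	blk_finish_plug(&plug);	/* runs raid1_unplug(..., from_schedule=false) here */
}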
From d9f691c365a83ce2530f0e46b947365c2db44ea0 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 2 Aug 2012 08:35:43 +1000
Subject: md/dm-raid: DM_RAID should select MD_RAID10

Now that DM_RAID supports raid10, it needs to select that code
to ensure it is included.

Cc: Jonathan Brassow
Reported-by: Fengguang Wu
Signed-off-by: NeilBrown
---
 drivers/md/Kconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 10f122a3a85..1de441a6c55 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,13 +277,14 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 1/4/5/6 target"
+	tristate "RAID 1/4/5/6/10 target"
 	depends on BLK_DEV_DM
 	select MD_RAID1
+	select MD_RAID10
 	select MD_RAID456
 	select BLK_DEV_MD
 	---help---
-	 A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
+	 A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings
 
 	 A RAID-5 set of N drives with a capacity of C MB per drive provides
 	 the capacity of C * (N - 1) MB, and protects against a failure