From 532a2a3fba8df076d65fdf17518eeb327b37a313 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Thu, 11 Oct 2012 13:30:52 +1100
Subject: md: raid 10 supports TRIM

This makes md raid 10 support TRIM.

If one disk supports discard and another not, or one has
discard_zero_data and another not, there could be inconsistent between
data from such disks. But this should not matter, discarded data is
useless. This will add extra copy in rebuild though.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1c2eb38f3c5..f92e0ed59be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf)
 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
 			bio->bi_next = NULL;
-			generic_make_request(bio);
+			if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+				/* Just ignore it */
+				bio_endio(bio, 0);
+			else
+				generic_make_request(bio);
 			bio = next;
 		}
 	} else
@@ -1061,6 +1066,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
+	const unsigned long do_discard = (bio->bi_rw
+					  & (REQ_DISCARD | REQ_SECURE));
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	int sectors_handled;
@@ -1081,7 +1088,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			 || conf->prev.near_copies < conf->prev.raid_disks))) {
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
-		if (bio->bi_vcnt != 1 ||
+		if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
 		    bio->bi_idx != 0)
 			goto bad_map;
 		/* This is a one page bio that upper layers
@@ -1410,7 +1417,7 @@ retry_write:
 						      conf->mirrors[d].rdev));
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua;
+		mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
@@ -1439,7 +1446,7 @@ retry_write:
 					   conf->mirrors[d].replacement));
 		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync | do_fua;
+		mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
@@ -1723,6 +1730,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		clear_bit(Unmerged, &rdev->flags);
 	}
 	md_integrity_add_rdev(rdev, mddev);
+	if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
 	print_conf(conf);
 	return err;
 }
@@ -3480,6 +3490,7 @@ static int run(struct mddev *mddev)
 	sector_t size;
 	sector_t min_offset_diff = 0;
 	int first = 1;
+	bool discard_supported = false;
 
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
@@ -3496,6 +3507,8 @@ static int run(struct mddev *mddev)
 
 	chunk_size = mddev->chunk_sectors << 9;
 	if (mddev->queue) {
+		blk_queue_max_discard_sectors(mddev->queue,
+					      mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -3541,8 +3554,16 @@ static int run(struct mddev *mddev)
 					  rdev->data_offset << 9);
 
 		disk->head_position = 0;
+
+		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+			discard_supported = true;
 	}
 
+	if (discard_supported)
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	else
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
 		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
-- 
cgit v1.2.3


From 57c67df48866d57b50d72eb198ffcc0cf7a6232d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 11 Oct 2012 13:32:13 +1100
Subject: md/raid10: submit IO from originating thread instead of md thread.

queuing writes to the md thread means that all requests go through the
one processor which may not be able to keep up with very high request
rates.

So use the plugging infrastructure to submit all requests on unplug.
If a 'schedule' is needed, we fall back on the old approach of handing
the requests to the thread for it to handle.

This is nearly identical to a recent patch which provided similar
functionality to RAID1.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f92e0ed59be..05dc96a950d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1055,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio,
 		return rdev->new_data_offset;
 }
 
+struct raid10_plug_cb {
+	struct blk_plug_cb	cb;
+	struct bio_list		pending;
+	int			pending_cnt;
+};
+
+static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
+						   cb);
+	struct mddev *mddev = plug->cb.data;
+	struct r10conf *conf = mddev->private;
+	struct bio *bio;
+
+	if (from_schedule) {
+		spin_lock_irq(&conf->device_lock);
+		bio_list_merge(&conf->pending_bio_list, &plug->pending);
+		conf->pending_count += plug->pending_cnt;
+		spin_unlock_irq(&conf->device_lock);
+		md_wakeup_thread(mddev->thread);
+		kfree(plug);
+		return;
+	}
+
+	/* we aren't scheduling, so we can do the write-out directly. */
+	bio = bio_list_get(&plug->pending);
+	bitmap_unplug(mddev->bitmap);
+	wake_up(&conf->wait_barrier);
+
+	while (bio) { /* submit pending writes */
+		struct bio *next = bio->bi_next;
+		bio->bi_next = NULL;
+		generic_make_request(bio);
+		bio = next;
+	}
+	kfree(plug);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r10conf *conf = mddev->private;
@@ -1070,6 +1108,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 					  & (REQ_DISCARD | REQ_SECURE));
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
+	struct blk_plug_cb *cb;
+	struct raid10_plug_cb *plug = NULL;
 	int sectors_handled;
 	int max_sectors;
 	int sectors;
@@ -1421,11 +1461,22 @@ retry_write:
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
+
+		cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
+		if (cb)
+			plug = container_of(cb, struct raid10_plug_cb, cb);
+		else
+			plug = NULL;
 		spin_lock_irqsave(&conf->device_lock, flags);
-		bio_list_add(&conf->pending_bio_list, mbio);
-		conf->pending_count++;
+		if (plug) {
+			bio_list_add(&plug->pending, mbio);
+			plug->pending_cnt++;
+		} else {
+			bio_list_add(&conf->pending_bio_list, mbio);
+			conf->pending_count++;
+		}
 		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!mddev_check_plugged(mddev))
+		if (!plug)
 			md_wakeup_thread(mddev->thread);
 
 		if (!r10_bio->devs[i].repl_bio)
-- 
cgit v1.2.3


From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Thu, 11 Oct 2012 13:34:00 +1100
Subject: MD: change the parameter of md thread

Change the thread parameter, so the thread can carry extra info. Next patch
will use it.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 05dc96a950d..54860604d09 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2732,8 +2732,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 	}
 }
 
-static void raid10d(struct mddev *mddev)
+static void raid10d(struct md_thread *thread)
 {
+	struct mddev *mddev = thread->mddev;
 	struct r10bio *r10_bio;
 	unsigned long flags;
 	struct r10conf *conf = mddev->private;
-- 
cgit v1.2.3


From 2863b9eb44787adecba4f977d71d7fd876805b1c Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 11 Oct 2012 13:38:58 +1100
Subject: MD RAID10: Prep for DM RAID10 device replacement capability

MD RAID10:  Fix a couple potential kernel panics if RAID10 is used by dm-raid

When device-mapper uses the RAID10 personality through dm-raid.c, there is no
'gendisk' structure in mddev and some sysfs information is also not populated.

This patch avoids touching those non-existent structures.

Signed-off-by: Jonathan Brassow <jbrassow@rehdat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 54860604d09..fb5bd607e15 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1694,7 +1694,7 @@ static int raid10_spare_active(struct mddev *mddev)
 			   && !test_bit(Faulty, &tmp->rdev->flags)
 			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
-			sysfs_notify_dirent(tmp->rdev->sysfs_state);
+			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
 		}
 	}
 	spin_lock_irqsave(&conf->device_lock, flags);
-- 
cgit v1.2.3


From 7f7583d420231b9d09897afd57a957011b606a5b Mon Sep 17 00:00:00 2001
From: Jianpeng Ma <majianpeng@gmail.com>
Date: Thu, 11 Oct 2012 14:17:59 +1100
Subject: Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid
 races

Now that multiple threads can handle stripes, it is safer to
use an atomic64_t for resync_mismatches, to avoid update races.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fb5bd607e15..146749b277c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2011,7 +2011,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 					break;
 			if (j == vcnt)
 				continue;
-			mddev->resync_mismatches += r10_bio->sectors;
+			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
 			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 				/* Don't fix anything. */
 				continue;
-- 
cgit v1.2.3


From 91502f099dfc5a1e8812898e26ee280713e1d002 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 11 Oct 2012 14:20:58 +1100
Subject: md/raid10: use correct limit variable

Clang complains that we are assigning a variable to itself.  This should
be using bad_sectors like the similar earlier check does.

Bug has been present since 3.1-rc1.  It is minor but could
conceivably cause corruption or other bad behaviour.

Cc: stable@vger.kernel.org
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 146749b277c..867d1b4e963 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3218,7 +3218,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 				else {
 					bad_sectors -= (sector - first_bad);
 					if (max_sync > bad_sectors)
-						max_sync = max_sync;
+						max_sync = bad_sectors;
 					continue;
 				}
 			}
-- 
cgit v1.2.3