From 532a2a3fba8df076d65fdf17518eeb327b37a313 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:30:52 +1100 Subject: md: raid 10 supports TRIM This makes md raid 10 support TRIM. If one disk supports discard and another not, or one has discard_zero_data and another not, there could be inconsistent between data from such disks. But this should not matter, discarded data is useless. This will add extra copy in rebuild though. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid10.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1c2eb38f3c5..f92e0ed59be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -1061,6 +1066,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; int sectors_handled; @@ -1081,7 +1088,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -1410,7 +1417,7 @@ retry_write: conf->mirrors[d].rdev)); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1439,7 +1446,7 @@ retry_write: conf->mirrors[d].replacement)); mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1723,6 +1730,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + print_conf(conf); return err; } @@ -3480,6 +3490,7 @@ static int run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + bool discard_supported = false; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -3496,6 +3507,8 @@ static int run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; if (mddev->queue) { + blk_queue_max_discard_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3541,8 +3554,16 @@ static int run(struct mddev *mddev) rdev->data_offset << 9); disk->head_position = 0; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", -- cgit v1.2.3 From 57c67df48866d57b50d72eb198ffcc0cf7a6232d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:32:13 +1100 Subject: md/raid10: submit IO from originating thread instead of md thread. queuing writes to the md thread means that all requests go through the one processor which may not be able to keep up with very high request rates. So use the plugging infrastructure to submit all requests on unplug. If a 'schedule' is needed, we fall back on the old approach of handing the requests to the thread for it to handle. This is nearly identical to a recent patch which provided similar functionality to RAID1. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f92e0ed59be..05dc96a950d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1055,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; @@ -1070,6 +1108,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; int sectors_handled; int max_sectors; int sectors; @@ -1421,11 +1461,22 @@ retry_write: mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) -- cgit v1.2.3 From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:34:00 +1100 Subject: MD: change the parameter of md thread Change the thread parameter, so the thread can carry extra info. Next patch will use it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid10.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 05dc96a950d..54860604d09 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2732,8 +2732,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } } -static void raid10d(struct mddev *mddev) +static void raid10d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r10bio *r10_bio; unsigned long flags; struct r10conf *conf = mddev->private; -- cgit v1.2.3 From 2863b9eb44787adecba4f977d71d7fd876805b1c Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:38:58 +1100 Subject: MD RAID10: Prep for DM RAID10 device replacement capability MD RAID10: Fix a couple potential kernel panics if RAID10 is used by dm-raid When device-mapper uses the RAID10 personality through dm-raid.c, there is no 'gendisk' structure in mddev and some sysfs information is also not populated. This patch avoids touching those non-existent structures. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 54860604d09..fb5bd607e15 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1694,7 +1694,7 @@ static int raid10_spare_active(struct mddev *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); -- cgit v1.2.3 From 7f7583d420231b9d09897afd57a957011b606a5b Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 14:17:59 +1100 Subject: Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races Now that multiple threads can handle stripes, it is safer to use an atomic64_t for resync_mismatches, to avoid update races. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fb5bd607e15..146749b277c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2011,7 +2011,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) break; if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue; -- cgit v1.2.3 From 91502f099dfc5a1e8812898e26ee280713e1d002 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Oct 2012 14:20:58 +1100 Subject: md/raid10: use correct limit variable Clang complains that we are assigning a variable to itself. This should be using bad_sectors like the similar earlier check does. Bug has been present since 3.1-rc1. It is minor but could conceivably cause corruption or other bad behaviour. Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 146749b277c..867d1b4e963 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3218,7 +3218,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, else { bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) - max_sync = max_sync; + max_sync = bad_sectors; continue; } } -- cgit v1.2.3