aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2009-08-13 10:59:29 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-08-13 10:59:29 -0700
commit d58d2d1adec90e7bc0c56e09b3ac0e9a5a471e68 (patch)
tree 17044053cfbde45da9bf8654ab6272b92773181c
parent 7334219c44826ae0ebe6f07555c6b97f978ce266 (diff)
parent 4d484a4a7a5126410eed5f8dd329a33f6eeed068 (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md:
  md: allow upper limit for resync/reshape to be set when array is read-only
  md/raid5: Properly remove excess drives after shrinking a raid5/6
  md/raid5: make sure a reshape restarts at the correct address.
  md/raid5: allow new reshape modes to be restarted in the middle.
  md: never advance 'events' counter by more than 1.
  Remove deadlock potential in md_open
-rw-r--r-- drivers/md/md.c 32
-rw-r--r-- drivers/md/md.h 10
-rw-r--r-- drivers/md/raid5.c 34
3 files changed, 56 insertions, 20 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5b98bea4ff9b..103f2d33fa89 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit)
else
new->md_minor = MINOR(unit) >> MdpMinorShift;
+ mutex_init(&new->open_mutex);
mutex_init(&new->reconfig_mutex);
INIT_LIST_HEAD(&new->disks);
INIT_LIST_HEAD(&new->all_mddevs);
@@ -1974,17 +1975,14 @@ repeat:
/* otherwise we have to go forward and ... */
mddev->events ++;
if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
- /* .. if the array isn't clean, insist on an odd 'events' */
- if ((mddev->events&1)==0) {
- mddev->events++;
+ /* .. if the array isn't clean, an 'even' event must also go
+ * to spares. */
+ if ((mddev->events&1)==0)
nospares = 0;
- }
} else {
- /* otherwise insist on an even 'events' (for clean states) */
- if ((mddev->events&1)) {
- mddev->events++;
+ /* otherwise an 'odd' event must go to spares */
+ if ((mddev->events&1))
nospares = 0;
- }
}
}
@@ -3601,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
if (max < mddev->resync_min)
return -EINVAL;
if (max < mddev->resync_max &&
+ mddev->ro == 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
@@ -4304,12 +4303,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
struct gendisk *disk = mddev->gendisk;
mdk_rdev_t *rdev;
+ mutex_lock(&mddev->open_mutex);
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
- return -EBUSY;
- }
-
- if (mddev->pers) {
+ err = -EBUSY;
+ } else if (mddev->pers) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4367,7 +4365,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
set_disk_ro(disk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
-
+out:
+ mutex_unlock(&mddev->open_mutex);
+ if (err)
+ return err;
/*
* Free resources if final stop
*/
@@ -4433,7 +4434,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
-out:
return err;
}
@@ -5518,12 +5518,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
}
BUG_ON(mddev != bdev->bd_disk->private_data);
- if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+ if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
goto out;
err = 0;
atomic_inc(&mddev->openers);
- mddev_unlock(mddev);
+ mutex_unlock(&mddev->open_mutex);
check_disk_change(bdev);
out:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 78f03168baf9..f8fc188bc762 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -223,6 +223,16 @@ struct mddev_s
* so we don't loop trying */
int in_sync; /* know to not need resync */
+ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+ * that we are never stopping an array while it is open.
+ * 'reconfig_mutex' protects all other reconfiguration.
+ * These locks are separate due to conflicting interactions
+ * with bdev->bd_mutex.
+ * Lock ordering is:
+ * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+ * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
+ */
+ struct mutex open_mutex;
struct mutex reconfig_mutex;
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2b521ee67dfa..b8a2c5dc67ba 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
- } else if (mddev->delta_disks > 0 &&
+ } else if (mddev->delta_disks >= 0 &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
@@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev)
(old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
- if (here_new >= here_old) {
+ if (mddev->delta_disks == 0) {
+ /* We cannot be sure it is safe to start an in-place
+ * reshape. It is only safe if user-space is monitoring
+ * and taking constant backups.
+ * mdadm always starts a situation like this in
+ * readonly mode so it can take control before
+ * allowing any writes. So just check for that.
+ */
+ if ((here_new * mddev->new_chunk_sectors !=
+ here_old * mddev->chunk_sectors) ||
+ mddev->ro == 0) {
+ printk(KERN_ERR "raid5: in-place reshape must be started"
+ " in read-only mode - aborting\n");
+ return -EINVAL;
+ }
+ } else if (mddev->delta_disks < 0
+ ? (here_new * mddev->new_chunk_sectors <=
+ here_old * mddev->chunk_sectors)
+ : (here_new * mddev->new_chunk_sectors >=
+ here_old * mddev->chunk_sectors)) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "raid5: reshape_position too early for "
"auto-recovery - aborting.\n");
@@ -5078,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev)
mddev->degraded--;
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
- d++)
- raid5_remove_disk(mddev, d);
+ d++) {
+ mdk_rdev_t *rdev = conf->disks[d].rdev;
+ if (rdev && raid5_remove_disk(mddev, d) == 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
}
mddev->layout = conf->algorithm;
mddev->chunk_sectors = conf->chunk_sectors;