aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/md/bitmap.c152
-rw-r--r--drivers/md/bitmap.h22
-rw-r--r--drivers/md/dm-raid.c16
-rw-r--r--drivers/md/faulty.c2
-rw-r--r--drivers/md/linear.c32
-rw-r--r--drivers/md/md.c140
-rw-r--r--drivers/md/md.h13
-rw-r--r--drivers/md/multipath.c2
-rw-r--r--drivers/md/raid0.c164
-rw-r--r--drivers/md/raid0.h11
-rw-r--r--drivers/md/raid1.c98
-rw-r--r--drivers/md/raid10.c187
-rw-r--r--drivers/md/raid5.c25
-rw-r--r--include/linux/raid/md_p.h6
14 files changed, 496 insertions, 374 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 045e086144a..3d0dfa7a89a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/buffer_head.h>
+#include <linux/seq_file.h>
#include "md.h"
#include "bitmap.h"
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
}
/*
- * just a placeholder - calls kmalloc for bitmap pages
- */
-static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
-{
- unsigned char *page;
-
- page = kzalloc(PAGE_SIZE, GFP_NOIO);
- if (!page)
- printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
- else
- pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
- bmname(bitmap), page);
- return page;
-}
-
-/*
- * for now just a placeholder -- just calls kfree for bitmap pages
- */
-static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
-{
- pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
- kfree(page);
-}
-
-/*
* check a page and, if necessary, allocate it (or hijack it if the alloc fails)
*
* 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
/* this page has not been allocated yet */
spin_unlock_irq(&bitmap->lock);
- mappage = bitmap_alloc_page(bitmap);
+ mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
spin_lock_irq(&bitmap->lock);
if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
} else if (bitmap->bp[page].map ||
bitmap->bp[page].hijacked) {
/* somebody beat us to getting the page */
- bitmap_free_page(bitmap, mappage);
+ kfree(mappage);
return 0;
} else {
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
ptr = bitmap->bp[page].map;
bitmap->bp[page].map = NULL;
bitmap->missing_pages++;
- bitmap_free_page(bitmap, ptr);
+ kfree(ptr);
}
}
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
did_alloc = 1;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (! test_bit(In_sync, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
@@ -445,18 +421,13 @@ out:
void bitmap_update_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb;
- unsigned long flags;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
return;
if (bitmap->mddev->bitmap_info.external)
return;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->sb_page) { /* no superblock */
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (!bitmap->sb_page) /* no superblock */
return;
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
sb = kmap_atomic(bitmap->sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared)
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
- if (!bitmap->mddev->persistent)
- goto success;
-
- /*
- * if we have a persistent array superblock, compare the
- * bitmap's UUID and event counter to the mddev's
- */
- if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
- printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
- bmname(bitmap));
- goto out;
- }
- events = le64_to_cpu(sb->events);
- if (events < bitmap->mddev->events) {
- printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
- "-- forcing full recovery\n", bmname(bitmap), events,
- (unsigned long long) bitmap->mddev->events);
- sb->state |= cpu_to_le32(BITMAP_STALE);
+ if (bitmap->mddev->persistent) {
+ /*
+ * We have a persistent array superblock, so compare the
+ * bitmap's UUID and event counter to the mddev's
+ */
+ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+ printk(KERN_INFO
+ "%s: bitmap superblock UUID mismatch\n",
+ bmname(bitmap));
+ goto out;
+ }
+ events = le64_to_cpu(sb->events);
+ if (events < bitmap->mddev->events) {
+ printk(KERN_INFO
+ "%s: bitmap file is out of date (%llu < %llu) "
+ "-- forcing full recovery\n",
+ bmname(bitmap), events,
+ (unsigned long long) bitmap->mddev->events);
+ sb->state |= cpu_to_le32(BITMAP_STALE);
+ }
}
-success:
+
/* assign fields using values from superblock */
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -680,15 +653,10 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
enum bitmap_mask_op op)
{
bitmap_super_t *sb;
- unsigned long flags;
int old;
- spin_lock_irqsave(&bitmap->lock, flags);
- if (!bitmap->sb_page) { /* can't set the state */
- spin_unlock_irqrestore(&bitmap->lock, flags);
+ if (!bitmap->sb_page) /* can't set the state */
return 0;
- }
- spin_unlock_irqrestore(&bitmap->lock, flags);
sb = kmap_atomic(bitmap->sb_page);
old = le32_to_cpu(sb->state) & bits;
switch (op) {
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long bit;
struct page *page;
void *kaddr;
- unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+ unsigned long chunk = block >> bitmap->chunkshift;
if (!bitmap->filemap)
return;
@@ -1069,10 +1037,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
kunmap_atomic(paddr);
if (b) {
/* if the disk bit is set, set the memory bit */
- int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+ int needed = ((sector_t)(i+1) << bitmap->chunkshift
>= start);
bitmap_set_memory_bits(bitmap,
- (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)i << bitmap->chunkshift,
needed);
bit_cnt++;
}
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
{
- sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
bitmap->bp[page].count += inc;
bitmap_checkfree(bitmap, page);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
bitmap->allclean = 0;
}
bmc = bitmap_get_counter(bitmap,
- (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)j << bitmap->chunkshift,
&blocks, 0);
if (!bmc)
j |= PAGE_COUNTER_MASK;
@@ -1231,7 +1199,7 @@ void bitmap_daemon_work(struct mddev *mddev)
/* we can clear the bit */
*bmc = 0;
bitmap_count_page(bitmap,
- (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+ (sector_t)j << bitmap->chunkshift,
-1);
/* clear the bit */
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
* The lock must have been taken with interrupts enabled.
* If !create, we don't release the lock.
*/
- sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
if (bitmap->bp[page].hijacked ||
bitmap->bp[page].map == NULL)
- csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
+ csize = ((sector_t)1) << (bitmap->chunkshift +
PAGE_COUNTER_SHIFT - 1);
else
- csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
+ csize = ((sector_t)1) << bitmap->chunkshift;
*blocks = csize - (offset & (csize - 1));
if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
set_page_attr(bitmap,
filemap_get_page(
bitmap,
- offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+ offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
else {
if (*bmc <= 2) {
set_page_attr(bitmap,
- filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+ filemap_get_page(bitmap, offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
- sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
+ sector &= ~((1ULL << bitmap->chunkshift) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
struct page *page;
*bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1);
- page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
+ page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
unsigned long chunk;
for (chunk = s; chunk <= e; chunk++) {
- sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
+ sector_t sec = (sector_t)chunk << bitmap->chunkshift;
bitmap_set_memory_bits(bitmap, sec, 1);
spin_lock_irq(&bitmap->lock);
bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
goto error;
bitmap->daemon_lastrun = jiffies;
- bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
+ bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
+ - BITMAP_BLOCK_SHIFT);
/* now that chunksize and chunkshift are set, we can use these macros */
- chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
- CHUNK_BLOCK_SHIFT(bitmap);
+ chunks = (blocks + bitmap->chunkshift - 1) >>
+ bitmap->chunkshift;
pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
BUG_ON(!pages);
@@ -1836,6 +1805,33 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+{
+ unsigned long chunk_kb;
+ unsigned long flags;
+
+ if (!bitmap)
+ return;
+
+ spin_lock_irqsave(&bitmap->lock, flags);
+ chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+ "%lu%s chunk",
+ bitmap->pages - bitmap->missing_pages,
+ bitmap->pages,
+ (bitmap->pages - bitmap->missing_pages)
+ << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
+ chunk_kb ? "KB" : "B");
+ if (bitmap->file) {
+ seq_printf(seq, ", file: ");
+ seq_path(seq, &bitmap->file->f_path, " \t\n");
+ }
+
+ seq_printf(seq, "\n");
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+}
+
static ssize_t
location_show(struct mddev *mddev, char *page)
{
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
mddev->pers->quiesce(mddev, 1);
rv = bitmap_create(mddev);
+ if (!rv)
+ rv = bitmap_load(mddev);
if (rv) {
bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3..55ca5aec84e 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3
-#define BITMAP_MINOR 39
-
/*
* in-memory bitmap:
*
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
-#define BITMAP_BLOCK_SIZE 512
#define BITMAP_BLOCK_SHIFT 9
/* how many blocks per chunk? (this is variable) */
#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
-
-/* when hijacked, the counters and bits represent even larger "chunks" */
-/* there will be 1024 chunks represented by each counter in the page pointers */
-#define PAGEPTR_BLOCK_RATIO(bitmap) \
- (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
-#define PAGEPTR_BLOCK_SHIFT(bitmap) \
- (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
-#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
#endif
@@ -181,12 +168,6 @@ struct bitmap_page {
unsigned int count:31;
};
-/* keep track of bitmap file pages that have pending writes on them */
-struct page_list {
- struct list_head list;
- struct page *page;
-};
-
/* the main bitmap structure - one per mddev */
struct bitmap {
struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
struct mddev *mddev; /* the md device that the bitmap is for */
/* bitmap chunksize -- how much data does each bit represent? */
- unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */
__u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
void bitmap_print_sb(struct bitmap *bitmap);
void bitmap_update_sb(struct bitmap *bitmap);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
int bitmap_setallbits(struct bitmap *bitmap);
void bitmap_write_all(struct bitmap *bitmap);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 787022c1818..c5a875d7b88 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -615,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *r, *t;
+ struct md_rdev *r;
uint64_t failed_devices;
struct dm_raid_superblock *sb;
sb = page_address(rdev->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices);
- rdev_for_each(r, t, mddev)
+ rdev_for_each(r, mddev)
if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
failed_devices |= (1ULL << r->raid_disk);
@@ -707,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
struct dm_raid_superblock *sb;
uint32_t new_devs = 0;
uint32_t rebuilds = 0;
- struct md_rdev *r, *t;
+ struct md_rdev *r;
struct dm_raid_superblock *sb2;
sb = page_address(rdev->sb_page);
@@ -750,7 +750,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* case the In_sync bit will /not/ be set and
* recovery_cp must be MaxSector.
*/
- rdev_for_each(r, t, mddev) {
+ rdev_for_each(r, mddev) {
if (!test_bit(In_sync, &r->flags)) {
DMINFO("Device %d specified for rebuild: "
"Clearing superblock", r->raid_disk);
@@ -782,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
* Now we set the Faulty bit for those devices that are
* recorded in the superblock as failed.
*/
- rdev_for_each(r, t, mddev) {
+ rdev_for_each(r, mddev) {
if (!r->sb_page)
continue;
sb2 = page_address(r->sb_page);
@@ -855,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int ret;
- struct md_rdev *rdev, *freshest, *tmp;
+ struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
freshest = NULL;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each(rdev, mddev) {
if (!rdev->meta_bdev)
continue;
@@ -888,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (super_validate(mddev, freshest))
return -EINVAL;
- rdev_for_each(rdev, tmp, mddev)
+ rdev_for_each(rdev, mddev)
if ((rdev != freshest) && super_validate(mddev, rdev))
return -EINVAL;
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb4..45135f69509 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
}
conf->nfaults = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
conf->rdev = rdev;
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb..b0fcc7d02ad 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
struct dev_info *dev0;
unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int maxbytes = biovec->bv_len;
+ struct request_queue *subq;
rcu_read_lock();
dev0 = which_dev(mddev, sector);
maxsectors = dev0->end_sector - sector;
+ subq = bdev_get_queue(dev0->rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = dev0->rdev->bdev;
+ bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
+ maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
+ biovec));
+ }
rcu_read_unlock();
if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
maxsectors -= bio_sectors;
if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
- return biovec->bv_len;
- /* The bytes available at this offset could be really big,
- * so we cap at 2^31 to avoid overflow */
- if (maxsectors > (1 << (31-9)))
- return 1<<31;
- return maxsectors << 9;
+ return maxbytes;
+
+ if (maxsectors > (maxbytes >> 9))
+ return maxbytes;
+ else
+ return maxsectors << 9;
}
static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
cnt = 0;
conf->array_sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
int j = rdev->raid_disk;
struct dev_info *disk = conf->disks + j;
sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit max_segments to 1 lying within
- * a single page.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
conf->array_sectors += rdev->sectors;
cnt++;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755baf4..b572e1e386c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
/* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->desc_nr == nr)
return rdev;
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
{
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->bdev->bd_dev == dev)
return rdev;
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->state |= (1<<MD_SB_BITMAP_PRESENT);
sb->disks[0].state = (1<<MD_DISK_REMOVED);
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
mdp_disk_t *d;
int desc_nr;
int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
| BB_LEN(internal_bb));
*bbp++ = cpu_to_le64(store_bb);
}
+ bb->changed = 0;
if (read_seqretry(&bb->lock, seq))
goto retry;
bb->sector = (rdev->sb_start +
(int)le32_to_cpu(sb->bblog_offset));
bb->size = le16_to_cpu(sb->bblog_size);
- bb->changed = 0;
}
}
max_dev = 0;
- list_for_each_entry(rdev2, &mddev->disks, same_set)
+ rdev_for_each(rdev2, mddev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
@@ -1833,7 +1833,7 @@ retry:
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
return 0; /* nothing to do */
if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */
if (test_bit(Faulty, &rdev->flags))
continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
{
struct md_rdev *rdev, *tmp;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (!rdev->mddev) {
MD_BUG();
continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
bitmap_print_sb(mddev->bitmap);
else
printk("%s: ", mdname(mddev));
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
printk("<%s>", bdevname(rdev->bdev,b));
printk("\n");
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
print_rdev(rdev, mddev->major_version);
}
printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
* with the rest of the array)
*/
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->sb_events == mddev->events ||
(nospares &&
rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
repeat:
/* First make sure individual recovery_offsets are correct */
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
if (!mddev->external) {
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
+ rdev->badblocks.changed = 0;
md_ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
@@ -2430,7 +2431,7 @@ repeat:
mddev->events --;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed)
any_badblocks_changed++;
if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
mdname(mddev), mddev->in_sync);
bitmap_update_sb(mddev->bitmap);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
clear_bit(Blocked, &rdev->flags);
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
struct md_rdev *rdev2;
mddev_lock(mddev);
- list_for_each_entry(rdev2, &mddev->disks, same_set)
+ rdev_for_each(rdev2, mddev)
if (rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
char b[BDEVNAME_SIZE];
freshest = NULL;
- rdev_for_each(rdev, tmp, mddev)
+ rdev_for_each_safe(rdev, tmp, mddev)
switch (super_types[mddev->major_version].
load_super(rdev, freshest, mddev->minor_version)) {
case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
validate_super(mddev, freshest);
i = 0;
- rdev_for_each(rdev, tmp, mddev) {
+ rdev_for_each_safe(rdev, tmp, mddev) {
if (mddev->max_disks &&
(rdev->desc_nr >= mddev->max_disks ||
i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
return -EINVAL;
}
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
rdev->new_raid_disk = rdev->raid_disk;
/* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->safemode = 0;
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
continue;
sysfs_unlink_rdev(mddev, rdev);
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev)
* the only valid external interface is through the md
* device.
*/
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (test_bit(Faulty, &rdev->flags))
continue;
sync_blockdev(rdev->bdev);
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev)
struct md_rdev *rdev2;
int warned = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev)
+ rdev_for_each(rdev2, mddev) {
if (rdev < rdev2 &&
rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev)
mddev->in_sync = 1;
smp_wmb();
mddev->ready = 1;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
+ mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.chunksize = 0;
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent_safe(mddev->sysfs_state);
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
sysfs_unlink_rdev(mddev, rdev);
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
printk(KERN_INFO "md: running: ");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
printk("<%s>", bdevname(rdev->bdev,b));
}
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
struct md_rdev *rdev;
nr=working=insync=failed=spare=0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
nr++;
if (test_bit(Faulty, &rdev->flags))
failed++;
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* grow, and re-add.
*/
return -EBUSY;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors;
if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
struct mddev *mddev = v;
sector_t sectors;
struct md_rdev *rdev;
- struct bitmap *bitmap;
if (v == (void*)1) {
struct md_personality *pers;
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
}
sectors = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
- if ((bitmap = mddev->bitmap)) {
- unsigned long chunk_kb;
- unsigned long flags;
- spin_lock_irqsave(&bitmap->lock, flags);
- chunk_kb = mddev->bitmap_info.chunksize >> 10;
- seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
- "%lu%s chunk",
- bitmap->pages - bitmap->missing_pages,
- bitmap->pages,
- (bitmap->pages - bitmap->missing_pages)
- << (PAGE_SHIFT - 10),
- chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
- chunk_kb ? "KB" : "B");
- if (bitmap->file) {
- seq_printf(seq, ", file: ");
- seq_path(seq, &bitmap->file->f_path, " \t\n");
- }
-
- seq_printf(seq, "\n");
- spin_unlock_irqrestore(&bitmap->lock, flags);
- }
+ bitmap_status(seq, mddev->bitmap);
seq_printf(seq, "\n");
}
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
max_sectors = mddev->dev_sectors;
j = MaxSector;
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
rcu_read_lock();
- list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+ rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev)
mddev->curr_resync_completed = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev)
"degraded");
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
* do the superblock for an incrementally recovered device
* written out.
*/
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (!mddev->degraded ||
test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = -1;
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
* failed devices.
*/
struct md_rdev *rdev;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
test_bit(Faulty, &rdev->flags) &&
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
return;
write_seqlock_irq(&bb->lock);
- if (bb->changed == 0) {
+ if (bb->changed == 0 && bb->unacked_exist) {
u64 *p = bb->page;
int i;
for (i = 0; i < bb->count ; i++) {
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
struct mddev *mddev;
int need_delay = 0;
- if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
-
- printk(KERN_INFO "md: stopping all md devices.\n");
-
- for_each_mddev(mddev, tmp) {
- if (mddev_trylock(mddev)) {
- /* Force a switch to readonly even array
- * appears to still be in use. Hence
- * the '100'.
- */
- md_set_readonly(mddev, 100);
- mddev_unlock(mddev);
- }
- need_delay = 1;
+ for_each_mddev(mddev, tmp) {
+ if (mddev_trylock(mddev)) {
+ __md_stop_writes(mddev);
+ mddev->safemode = 2;
+ mddev_unlock(mddev);
}
- /*
- * certain more exotic SCSI devices are known to be
- * volatile wrt too early system reboots. While the
- * right place to handle this issue is the given
- * driver, we do want to have a safe RAID driver ...
- */
- if (need_delay)
- mdelay(1000*1);
+ need_delay = 1;
}
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ if (need_delay)
+ mdelay(1000*1);
+
return NOTIFY_DONE;
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2..1c2063ccf48 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
+ Unmerged, /* device is being added to array and should
+ * be considerred for bvec_merge_fn but not
+ * yet for actual IO
+ */
WriteMostly, /* Avoid reading if at all possible */
AutoDetected, /* added by auto-detect */
Blocked, /* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
int degraded; /* whether md should consider
* adding a spare
*/
+ int merge_check_needed; /* at least one
+ * member device
+ * has a
+ * merge_bvec_fn */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
/*
* iterates through the 'same array disks' ringlist
*/
-#define rdev_for_each(rdev, tmp, mddev) \
+#define rdev_for_each(rdev, mddev) \
+ list_for_each_entry(rdev, &((mddev)->disks), same_set)
+
+#define rdev_for_each_safe(rdev, tmp, mddev) \
list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
#define rdev_for_each_rcu(rdev, mddev) \
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660..9339e67fcc7 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
}
working_disks = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e3..6f31f5596e0 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
if (!conf)
return -ENOMEM;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ rdev_for_each(rdev1, mddev) {
pr_debug("md/raid0:%s: looking at %s\n",
mdname(mddev),
bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
- list_for_each_entry(rdev2, &mddev->disks, same_set) {
+ rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
smallest = NULL;
dev = conf->devlist;
err = -EINVAL;
- list_for_each_entry(rdev1, &mddev->disks, same_set) {
+ rdev_for_each(rdev1, mddev) {
int j = rdev1->raid_disk;
if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to 1, lying within
- * a single page.
- */
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
+ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
+ conf->has_merge_bvec = 1;
+
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
@@ -290,8 +284,64 @@ abort:
return err;
}
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
+static struct strip_zone *find_zone(struct r0conf *conf,
+ sector_t *sectorp)
+{
+ int i;
+ struct strip_zone *z = conf->strip_zone;
+ sector_t sector = *sectorp;
+
+ for (i = 0; i < conf->nr_strip_zones; i++)
+ if (sector < z[i].zone_end) {
+ if (i)
+ *sectorp = sector - z[i-1].zone_end;
+ return z + i;
+ }
+ BUG();
+}
+
+/*
+ * remaps the bio to the target device. we separate two flows.
+ * power 2 flow and a general flow for the sake of perfromance
+*/
+static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
+ sector_t sector, sector_t *sector_offset)
+{
+ unsigned int sect_in_chunk;
+ sector_t chunk;
+ struct r0conf *conf = mddev->private;
+ int raid_disks = conf->strip_zone[0].nb_dev;
+ unsigned int chunk_sects = mddev->chunk_sectors;
+
+ if (is_power_of_2(chunk_sects)) {
+ int chunksect_bits = ffz(~chunk_sects);
+ /* find the sector offset inside the chunk */
+ sect_in_chunk = sector & (chunk_sects - 1);
+ sector >>= chunksect_bits;
+ /* chunk in zone */
+ chunk = *sector_offset;
+ /* quotient is the chunk in real device*/
+ sector_div(chunk, zone->nb_dev << chunksect_bits);
+ } else{
+ sect_in_chunk = sector_div(sector, chunk_sects);
+ chunk = *sector_offset;
+ sector_div(chunk, chunk_sects * zone->nb_dev);
+ }
+ /*
+ * position the bio over the real device
+ * real sector = chunk in device + starting of zone
+ * + the position in the chunk
+ */
+ *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+ return conf->devlist[(zone - conf->strip_zone)*raid_disks
+ + sector_div(sector, zone->nb_dev)];
+}
+
/**
- * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged
+ * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
* @q: request queue
* @bvm: properties of new bio
* @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
+ struct r0conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ sector_t sector_offset = sector;
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
+ struct strip_zone *zone;
+ struct md_rdev *rdev;
+ struct request_queue *subq;
if (is_power_of_2(chunk_sectors))
max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
else
max = (chunk_sectors - (sector_div(sector, chunk_sectors)
+ bio_sectors)) << 9;
- if (max < 0) max = 0; /* bio_add cannot handle a negative return */
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
if (max <= biovec->bv_len && bio_sectors == 0)
return biovec->bv_len;
- else
+ if (max < biovec->bv_len)
+ /* too small already, no need to check further */
+ return max;
+ if (!conf->has_merge_bvec)
+ return max;
+
+ /* May need to check subordinate device */
+ sector = sector_offset;
+ zone = find_zone(mddev->private, &sector_offset);
+ rdev = map_sector(mddev, zone, sector, &sector_offset);
+ subq = bdev_get_queue(rdev->bdev);
+ if (subq->merge_bvec_fn) {
+ bvm->bi_bdev = rdev->bdev;
+ bvm->bi_sector = sector_offset + zone->dev_start +
+ rdev->data_offset;
+ return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
+ } else
return max;
}
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
array_sectors += rdev->sectors;
return array_sectors;
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
return 0;
}
-/* Find the zone which holds a particular offset
- * Update *sectorp to be an offset in that zone
- */
-static struct strip_zone *find_zone(struct r0conf *conf,
- sector_t *sectorp)
-{
- int i;
- struct strip_zone *z = conf->strip_zone;
- sector_t sector = *sectorp;
-
- for (i = 0; i < conf->nr_strip_zones; i++)
- if (sector < z[i].zone_end) {
- if (i)
- *sectorp = sector - z[i-1].zone_end;
- return z + i;
- }
- BUG();
-}
-
-/*
- * remaps the bio to the target device. we separate two flows.
- * power 2 flow and a general flow for the sake of perfromance
-*/
-static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
- sector_t sector, sector_t *sector_offset)
-{
- unsigned int sect_in_chunk;
- sector_t chunk;
- struct r0conf *conf = mddev->private;
- int raid_disks = conf->strip_zone[0].nb_dev;
- unsigned int chunk_sects = mddev->chunk_sectors;
-
- if (is_power_of_2(chunk_sects)) {
- int chunksect_bits = ffz(~chunk_sects);
- /* find the sector offset inside the chunk */
- sect_in_chunk = sector & (chunk_sects - 1);
- sector >>= chunksect_bits;
- /* chunk in zone */
- chunk = *sector_offset;
- /* quotient is the chunk in real device*/
- sector_div(chunk, zone->nb_dev << chunksect_bits);
- } else{
- sect_in_chunk = sector_div(sector, chunk_sects);
- chunk = *sector_offset;
- sector_div(chunk, chunk_sects * zone->nb_dev);
- }
- /*
- * position the bio over the real device
- * real sector = chunk in device + starting of zone
- * + the position in the chunk
- */
- *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
- return conf->devlist[(zone - conf->strip_zone)*raid_disks
- + sector_div(sector, zone->nb_dev)];
-}
-
/*
* Is io distribute over 1 or more chunks ?
*/
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
}
sector_offset = bio->bi_sector;
- zone = find_zone(mddev->private, &sector_offset);
+ zone = find_zone(mddev->private, &sector_offset);
tmp_dev = map_sector(mddev, zone, bio->bi_sector,
&sector_offset);
bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
/* check slot number for a disk */
if (rdev->raid_disk == mddev->raid_disks-1) {
printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4..05539d9c97f 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
struct strip_zone {
sector_t zone_end; /* Start of the next zone (in sectors) */
sector_t dev_start; /* Zone offset in real dev (in sectors) */
- int nb_dev; /* # of devices attached to the zone */
+ int nb_dev; /* # of devices attached to the zone */
};
struct r0conf {
- struct strip_zone *strip_zone;
- struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
- int nr_strip_zones;
+ struct strip_zone *strip_zone;
+ struct md_rdev **devlist; /* lists of rdevs, pointed to
+ * by strip_zone->dev */
+ int nr_strip_zones;
+ int has_merge_bvec; /* at least one member has
+ * a merge_bvec_fn */
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a0b225eb4ac..4a40a200d76 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
+ || test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
+static int raid1_mergeable_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mddev *mddev = q->queuedata;
+ struct r1conf *conf = mddev->private;
+ sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+ int max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ int disk;
+ rcu_read_lock();
+ for (disk = 0; disk < conf->raid_disks * 2; disk++) {
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = sector +
+ rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
+
+}
+
int md_raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -1002,7 +1049,8 @@ read_again:
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags)
+ || test_bit(Unmerged, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p;
int first = 0;
int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
p->head_position = 0;
rdev->raid_disk = mirror;
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ raise_barrier(conf);
+ lower_barrier(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_segments to 1 lying within
- * a single page, as a one page request is never in violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
}
mddev->degraded = 0;
@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
}
return md_integrity_register(mddev);
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 58c44d6453a..3540316886f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -586,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* @biovec: the request that could be merged to it.
*
* Return amount of bytes we can accept at this offset
- * If near_copies == raid_disk, there are no striping issues,
- * but in that case, the function isn't called at all.
+ * This requires checking for end-of-chunk if near_copies != raid_disks,
+ * and for subordinate merge_bvec_fns if merge_check_needed.
*/
static int raid10_mergeable_bvec(struct request_queue *q,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
+ struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0; /* bio_add cannot handle a negative return */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
+ if (conf->near_copies < conf->raid_disks) {
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ /* bio_add cannot handle a negative return */
+ max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ } else
+ max = biovec->bv_len;
+
+ if (mddev->merge_check_needed) {
+ struct r10bio r10_bio;
+ int s;
+ r10_bio.sector = sector;
+ raid10_find_phys(conf, &r10_bio);
+ rcu_read_lock();
+ for (s = 0; s < conf->copies; s++) {
+ int disk = r10_bio.devs[s].devnum;
+ struct md_rdev *rdev = rcu_dereference(
+ conf->mirrors[disk].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio.devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q =
+ bdev_get_queue(rdev->bdev);
+ if (q->merge_bvec_fn) {
+ bvm->bi_sector = r10_bio.devs[s].addr
+ + rdev->data_offset;
+ bvm->bi_bdev = rdev->bdev;
+ max = min(max, q->merge_bvec_fn(
+ q, bvm, biovec));
+ }
+ }
+ }
+ rcu_read_unlock();
+ }
+ return max;
}
/*
@@ -668,11 +711,12 @@ retry:
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
- if (rdev == NULL)
- continue;
- if (test_bit(Faulty, &rdev->flags))
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -863,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
@@ -1121,12 +1178,14 @@ retry_write:
blocked_rdev = rrdev;
break;
}
- if (rrdev && test_bit(Faulty, &rrdev->flags))
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)
+ || test_bit(Unmerged, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ test_bit(Unmerged, &rdev->flags)) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
}
@@ -1477,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int mirror;
int first = 0;
int last = conf->raid_disks - 1;
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
* very different from resync
*/
return -EBUSY;
- if (!enough(conf, -1))
+ if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
+ if (q->merge_bvec_fn) {
+ set_bit(Unmerged, &rdev->flags);
+ mddev->merge_check_needed = 1;
+ }
+
if (rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
@@ -1508,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
err = 0;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
@@ -1520,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must
- * never risk violating it, so limit
- * ->max_segments to one lying with a single
- * page, as a one page request is never in
- * violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1541,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
-
+ if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+ /* Some requests might not have seen this new
+ * merge_bvec_fn. We must wait for them to complete
+ * before merging the device fully.
+ * First we make sure any code which has tested
+ * our function has submitted the request, then
+ * we wait for all outstanding requests to complete.
+ */
+ synchronize_sched();
+ raise_barrier(conf, 0);
+ lower_barrier(conf);
+ clear_bit(Unmerged, &rdev->flags);
+ }
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@@ -1682,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[d].replacement;
- if (!rdev) {
- smp_mb();
+ else
rdev = conf->mirrors[d].rdev;
- }
if (!uptodate) {
if (repl)
@@ -2087,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
+ !test_bit(Unmerged, &rdev->flags) &&
test_bit(In_sync, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
@@ -2140,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (!rdev ||
+ test_bit(Unmerged, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
@@ -3242,7 +3303,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks / conf->near_copies));
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= conf->raid_disks
@@ -3262,15 +3323,6 @@ static int run(struct mddev *mddev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit max_segments to 1 lying
- * within a single page.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
- blk_queue_max_segments(mddev->queue, 1);
- blk_queue_segment_boundary(mddev->queue,
- PAGE_CACHE_SIZE - 1);
- }
disk->head_position = 0;
}
@@ -3334,8 +3386,7 @@ static int run(struct mddev *mddev)
mddev->queue->backing_dev_info.ra_pages = 2* stripe;
}
- if (conf->near_copies < conf->raid_disks)
- blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+ blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -3385,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
}
}
+static int raid10_resize(struct mddev *mddev, sector_t sectors)
+{
+ /* Resize of 'far' arrays is not supported.
+ * For 'near' and 'offset' arrays we can set the
+ * number of sectors used to be an appropriate multiple
+ * of the chunk size.
+ * For 'offset', this is far_copies*chunksize.
+ * For 'near' the multiplier is the LCM of
+ * near_copies and raid_disks.
+ * So if far_copies > 1 && !far_offset, fail.
+ * Else find LCM(raid_disks, near_copy)*far_copies and
+ * multiply by chunk_size. Then round to this number.
+ * This is mostly done by raid10_size()
+ */
+ struct r10conf *conf = mddev->private;
+ sector_t oldsize, size;
+
+ if (conf->far_copies > 1 && !conf->far_offset)
+ return -EINVAL;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ size = raid10_size(mddev, sectors, 0);
+ md_set_array_sectors(mddev, size);
+ if (mddev->array_sectors > size)
+ return -EINVAL;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > oldsize) {
+ mddev->recovery_cp = oldsize;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = size;
+ return 0;
+}
+
static void *raid10_takeover_raid0(struct mddev *mddev)
{
struct md_rdev *rdev;
@@ -3408,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
conf = setup_conf(mddev);
if (!IS_ERR(conf)) {
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
rdev->new_raid_disk = rdev->raid_disk * 2;
conf->barrier = 1;
@@ -3454,6 +3542,7 @@ static struct md_personality raid10_personality =
.sync_request = sync_request,
.quiesce = raid10_quiesce,
.size = raid10_size,
+ .resize = raid10_resize,
.takeover = raid10_takeover,
};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62..23ac880bba9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread);
} else {
BUG_ON(stripe_operations_active(sh));
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ if (atomic_dec_return(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
- }
atomic_dec(&conf->active_stripes);
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
- list_for_each_entry(rdev, &mddev->disks, same_set) {
+ rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
|| raid_disk < 0)
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
- if (has_failed(conf))
+ if (rdev->saved_raid_disk < 0 && has_failed(conf))
/* no point adding a device */
return -EINVAL;
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev))
return -ENOSPC;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags))
spares++;
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
* such devices during the reshape and confusion could result.
*/
if (mddev->delta_disks >= 0) {
- int added_devices = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev) == 0) {
if (rdev->raid_disk
- >= conf->previous_raid_disks) {
+ >= conf->previous_raid_disks)
set_bit(In_sync, &rdev->flags);
- added_devices++;
- } else
+ else
rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
&& !test_bit(Faulty, &rdev->flags)) {
/* This is a spare that was manually added */
set_bit(In_sync, &rdev->flags);
- added_devices++;
}
/* When a reshape changes the number of devices,
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
conf->reshape_progress = MaxSector;
+ mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 6f6df86f1ae..8c0a3adc5df 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -281,6 +281,10 @@ struct mdp_superblock_1 {
* active device with same 'role'.
* 'recovery_offset' is also set.
*/
-#define MD_FEATURE_ALL (1|2|4|8|16)
+#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
+ |MD_FEATURE_RECOVERY_OFFSET \
+ |MD_FEATURE_RESHAPE_ACTIVE \
+ |MD_FEATURE_BAD_BLOCKS \
+ |MD_FEATURE_REPLACEMENT)
#endif