diff options
Diffstat (limited to 'drivers/md')
40 files changed, 777 insertions, 288 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3bfc8f1da9fe..29cff90096ad 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -176,8 +176,12 @@ config MD_FAULTY source "drivers/md/bcache/Kconfig" +config BLK_DEV_DM_BUILTIN + boolean + config BLK_DEV_DM tristate "Device mapper support" + select BLK_DEV_DM_BUILTIN ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1439fd4ad9b1..3591a7292381 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b42a4ab..6bc016eabdc9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -437,6 +437,7 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; + int flush_done; atomic_long_t sectors_dirty; unsigned long sectors_dirty_gc; @@ -498,7 +499,7 @@ struct cached_dev { */ atomic_t has_dirty; - struct ratelimit writeback_rate; + struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; /* @@ -507,10 +508,9 @@ struct cached_dev { */ sector_t last_read; - /* Number of writeback bios in flight */ - atomic_t in_flight; + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; struct closure_with_timer writeback; - struct closure_waitlist writeback_wait; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cb4578a327b9..1b27cbd822e1 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -918,29 +918,59 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) /* Mergesort */ -static void btree_sort_fixup(struct btree_iter *iter) +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { while (iter->used > 1) { struct btree_iter_set *top = iter->data, *i = top + 1; - struct bkey *k; if (iter->used > 2 && btree_iter_cmp(i[0], i[1])) i++; - for (k = i->k; - k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; - k = bkey_next(k)) - if (top->k > i->k) - __bch_cut_front(top->k, k); - else if (KEY_SIZE(k)) - bch_cut_back(&START_KEY(k), top->k); - - if (top->k < i->k || k == i->k) + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) break; - heap_sift(iter, i - top, btree_iter_cmp); + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, btree_iter_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, btree_iter_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, btree_iter_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } } + + return NULL; } static void btree_mergesort(struct btree *b, struct bset *out, @@ -948,15 +978,20 @@ static void btree_mergesort(struct btree *b, struct bset *out, bool fixup, bool remove_stale) { struct bkey *k, *last = NULL; + BKEY_PADDED(k) tmp; bool (*bad)(struct btree *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; while (!btree_iter_end(iter)) { if (fixup && !b->level) - btree_sort_fixup(iter); + k = btree_sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = bch_btree_iter_next(iter); - k = bch_btree_iter_next(iter); if (bad(b, k)) continue; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..7d3deab11fce 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -326,10 +326,25 @@ static void do_btree_write(struct btree *b) i->csum = btree_csum_set(b, i); btree_bio_init(b); - b->bio->bi_rw = REQ_META|WRITE_SYNC; + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); @@ -618,7 +633,7 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) return mca_can_free(c) * c->btree_pages; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_WAIT) + if (sc->gfp_mask & __GFP_IO) mutex_lock(&c->bucket_lock); else if (!mutex_trylock(&c->bucket_lock)) return -1; @@ -1419,8 +1434,10 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { b->gc_gen = b->gen; - if (!atomic_read(&b->pin)) + if (!atomic_read(&b->pin)) { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_SECTORS_USED(b, 0); + } } for (d = c->devices; @@ -2140,6 +2157,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) void bch_btree_set_root(struct btree *b) { unsigned i; + struct closure cl; + + closure_init_stack(&cl); BUG_ON(!b->written); @@ -2153,8 +2173,9 @@ void bch_btree_set_root(struct btree *b) b->c->root = b; __bkey_put(b->c, &b->key); - bch_journal_meta(b->c, NULL); + bch_journal_meta(b->c, &cl); pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); + closure_sync(&cl); } /* Cache lookup */ diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index bd05a9a8c7cf..9aba2017f0d1 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) } else { struct closure *parent = cl->parent; struct closure_waitlist *wait = closure_waitlist(cl); + closure_fn *destructor = cl->fn; closure_debug_destroy(cl); + smp_mb(); atomic_set(&cl->remaining, -1); if (wait) closure_wake_up(wait); - if (cl->fn) - cl->fn(cl); + if (destructor) + destructor(cl); if (parent) closure_put(parent); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..d285cd49104c 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -97,6 +97,8 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (bio->bi_rw & REQ_DISCARD) { ret = bio_alloc_bioset(gfp, 1, bs); + if (!ret) + return NULL; idx = 0; goto out; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..151a4ab90dd4 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -151,7 +151,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); pr_debug("%u journal buckets", ca->sb.njournal_buckets); - /* Read journal buckets ordered by golden ratio hash to quickly + /* + * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries */ for (i = 0; i < ca->sb.njournal_buckets; i++) { @@ -164,36 +165,45 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, goto bsearch; } - /* If that fails, check all the buckets we haven't checked + /* + * If that fails, check all the buckets we haven't checked * already */ pr_debug("falling back to linear search"); - for (l = 0; l < ca->sb.njournal_buckets; l++) { - if (test_bit(l, bitmap)) - continue; - + for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); + l < ca->sb.njournal_buckets; + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) if (read_bucket(l)) goto bsearch; - } + + if (list_empty(list)) + continue; bsearch: /* Binary search */ m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; + m = (l + r) >> 1; + read_bucket(m); - if (read_bucket(m)) + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) l = m; else r = m; } - /* Read buckets in reverse order until we stop finding more + /* + * Read buckets in reverse order until we stop finding more * journal entries */ - pr_debug("finishing up"); + pr_debug("finishing up: m %u njournal_buckets %u", + m, ca->sb.njournal_buckets); l = m; while (1) { @@ -221,9 +231,10 @@ bsearch: } } - c->journal.seq = list_entry(list->prev, - struct journal_replay, - list)->j.seq; + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; return 0; #undef read_bucket @@ -420,7 +431,7 @@ static void do_journal_discard(struct cache *ca) return; } - switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { + switch (atomic_read(&ja->discard_in_flight)) { case DISCARD_IN_FLIGHT: return; @@ -617,7 +628,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_sector = PTR_OFFSET(k, i); bio->bi_bdev = ca->bdev; - bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; bio->bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; @@ -681,6 +692,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl) if (cl) BUG_ON(!closure_wait(&w->wait, cl)); + closure_flush(&c->journal.io); __journal_try_write(c, true); } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..a30a0f8a41c0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -489,6 +489,12 @@ static void bch_insert_data_loop(struct closure *cl) bch_queue_gc(op->c); } + /* + * Journal writes are marked REQ_FLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + do { unsigned i; struct bkey *k; @@ -716,7 +722,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; @@ -1047,9 +1053,22 @@ static void request_write(struct cached_dev *dc, struct search *s) trace_bcache_writethrough(s->orig_bio); closure_bio_submit(bio, cl, s->d); } else { - s->op.cache_bio = bio; trace_bcache_writeback(s->orig_bio); bch_writeback_add(dc, bio_sectors(bio)); + s->op.cache_bio = bio; + + if (bio->bi_rw & REQ_FLUSH) { + /* Also need to send a flush to the backing device */ + struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, + dc->disk.bio_split); + + flush->bi_rw = WRITE_FLUSH; + flush->bi_bdev = bio->bi_bdev; + flush->bi_end_io = request_endio; + flush->bi_private = cl; + + closure_bio_submit(flush, cl, s->d); + } } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..b4713cea1913 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -704,7 +704,8 @@ static void bcache_device_detach(struct bcache_device *d) atomic_set(&d->detaching, 0); } - bcache_device_unlink(d); + if (!d->flush_done) + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -781,6 +782,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + return 0; } @@ -1014,6 +1017,14 @@ static void cached_dev_flush(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; + mutex_lock(&bch_register_lock); + d->flush_done = 1; + + if (d->c) + bcache_device_unlink(d); + + mutex_unlock(&bch_register_lock); + bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); @@ -1303,18 +1314,22 @@ static void cache_set_flush(struct closure *cl) static void __cache_set_unregister(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc, *t; + struct cached_dev *dc; size_t i; mutex_lock(&bch_register_lock); - if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) - list_for_each_entry_safe(dc, t, &c->cached_devs, list) - bch_cached_dev_detach(dc); - for (i = 0; i < c->nr_uuids; i++) - if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) - bcache_device_stop(c->devices[i]); + if (c->devices[i]) { + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); + } else { + bcache_device_stop(c->devices[i]); + } + } mutex_unlock(&bch_register_lock); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca47e4c6..e9bd6c0cca5b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -214,7 +214,13 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { - memcpy(dc->sb.label, buf, SB_LABEL_SIZE); + if (size > SB_LABEL_SIZE) + return -EINVAL; + memcpy(dc->sb.label, buf, size); + if (size < SB_LABEL_SIZE) + dc->sb.label[size] = '\0'; + if (size && dc->sb.label[size - 1] == '\n') + dc->sb.label[size - 1] = '\0'; bch_write_bdev_super(dc, NULL); if (dc->disk.c) { memcpy(dc->disk.c->uuids[dc->disk.id].label, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index da3a99e85b1e..38a43f8d36ec 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) stats->last = now ?: 1; } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done) +/** + * bch_next_delay() - increment @d by the amount of work done, and return how + * long to delay until the next time to do some work. + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + * + * Returns the amount of time to delay by, in jiffies + */ +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e38c3a..43fd78affcea 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequency_units) (ewma) >> factor; \ }) -struct ratelimit { +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ uint64_t next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to bch_next_delay() + */ unsigned rate; }; -static inline void ratelimit_reset(struct ratelimit *d) +static inline void bch_ratelimit_reset(struct bch_ratelimit *d) { d->next = local_clock(); } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done); +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); #define __DIV_SAFE(n, d, zero) \ ({ \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..841f0490d4ef 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -91,11 +91,15 @@ static void update_writeback_rate(struct work_struct *work) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { + uint64_t ret; + if (atomic_read(&dc->disk.detaching) || !dc->writeback_percent) return 0; - return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + + return min_t(uint64_t, ret, HZ); } /* Background writeback */ @@ -165,7 +169,7 @@ static void refill_dirty(struct closure *cl) up_write(&dc->writeback_lock); - ratelimit_reset(&dc->writeback_rate); + bch_ratelimit_reset(&dc->writeback_rate); /* Punt to workqueue only so we don't recurse and blow the stack */ continue_at(cl, read_dirty, dirty_wq); @@ -246,9 +250,7 @@ static void write_dirty_finish(struct closure *cl) } bch_keybuf_del(&dc->writeback_keys, w); - atomic_dec_bug(&dc->in_flight); - - closure_wake_up(&dc->writeback_wait); + up(&dc->in_flight); closure_return_with_destructor(cl, dirty_io_destructor); } @@ -278,7 +280,7 @@ static void write_dirty(struct closure *cl) trace_bcache_write_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty_finish, dirty_wq); + continue_at(cl, write_dirty_finish, system_wq); } static void read_dirty_endio(struct bio *bio, int error) @@ -299,7 +301,7 @@ static void read_dirty_submit(struct closure *cl) trace_bcache_read_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty, dirty_wq); + continue_at(cl, write_dirty, system_wq); } static void read_dirty(struct closure *cl) @@ -324,12 +326,8 @@ static void read_dirty(struct closure *cl) if (delay > 0 && (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) { - w->private = NULL; - - closure_delay(&dc->writeback, delay); - continue_at(cl, read_dirty, dirty_wq); - } + jiffies_to_msecs(delay) > 50)) + delay = schedule_timeout_uninterruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -354,15 +352,10 @@ static void read_dirty(struct closure *cl) pr_debug("%s", pkey(&w->key)); - closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + down(&dc->in_flight); + closure_call(&io->cl, read_dirty_submit, NULL, cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); - - atomic_inc(&dc->in_flight); - - if (!closure_wait_event(&dc->writeback_wait, cl, - atomic_read(&dc->in_flight) < 64)) - continue_at(cl, read_dirty, dirty_wq); } if (0) { @@ -372,11 +365,16 @@ err: bch_keybuf_del(&dc->writeback_keys, w); } - refill_dirty(cl); + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + continue_at(cl, refill_dirty, dirty_wq); } void bch_cached_dev_writeback_init(struct cached_dev *dc) { + sema_init(&dc->in_flight, 64); closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); @@ -406,7 +404,7 @@ void bch_writeback_exit(void) int __init bch_writeback_init(void) { - dirty_wq = create_singlethread_workqueue("bcache_writeback"); + dirty_wq = create_workqueue("bcache_writeback"); if (!dirty_wq) return -ENOMEM; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0387e05cdb98..a6e985fcceb8 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1660,6 +1660,11 @@ static int __init dm_bufio_init(void) { __u64 mem; + dm_bufio_allocated_kmem_cache = 0; + dm_bufio_allocated_get_free_pages = 0; + dm_bufio_allocated_vmalloc = 0; + dm_bufio_current_allocated = 0; + memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c new file mode 100644 index 000000000000..6c9049c51b2b --- /dev/null +++ b/drivers/md/dm-builtin.c @@ -0,0 +1,48 @@ +#include "dm.h" + +/* + * The kobject release method must not be placed in the module itself, + * otherwise we are subject to module unload races. + * + * The release method is called when the last reference to the kobject is + * dropped. It may be called by any other kernel code that drops the last + * reference. + * + * The release method suffers from module unload race. We may prevent the + * module from being unloaded at the start of the release method (using + * increased module reference count or synchronizing against the release + * method), however there is no way to prevent the module from being + * unloaded at the end of the release method. + * + * If this code were placed in the dm module, the following race may + * happen: + * 1. Some other process takes a reference to dm kobject + * 2. The user issues ioctl function to unload the dm device + * 3. dm_sysfs_exit calls kobject_put, however the object is not released + * because of the other reference taken at step 1 + * 4. dm_sysfs_exit waits on the completion + * 5. The other process that took the reference in step 1 drops it, + * dm_kobject_release is called from this process + * 6. dm_kobject_release calls complete() + * 7. a reschedule happens before dm_kobject_release returns + * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference + * count is decremented + * 9. The user unloads the dm module + * 10. The other process that was rescheduled in step 7 continues to run, + * it is now executing code in unloaded module, so it crashes + * + * Note that if the process that takes the foreign reference to dm kobject + * has a low priority and the system is sufficiently loaded with + * higher-priority processes that prevent the low-priority process from + * being scheduled long enough, this bug may really happen. + * + * In order to fix this module unload race, we place the release method + * into a helper code that is compiled directly into the kernel. + */ + +void dm_kobject_release(struct kobject *kobj) +{ + complete(dm_get_completion_from_kobject(kobj)); +} + +EXPORT_SYMBOL(dm_kobject_release); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index df44b60e66f2..516f9c922bb2 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -151,6 +151,9 @@ struct cache { atomic_t nr_migrations; wait_queue_head_t migration_wait; + wait_queue_head_t quiescing_wait; + atomic_t quiescing_ack; + /* * cache_size entries, dirty if set */ @@ -742,8 +745,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, static void cleanup_migration(struct dm_cache_migration *mg) { - dec_nr_migrations(mg->cache); + struct cache *cache = mg->cache; free_migration(mg); + dec_nr_migrations(cache); } static void migration_failure(struct dm_cache_migration *mg) @@ -1340,34 +1344,51 @@ static void writeback_some_dirty_blocks(struct cache *cache) /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ -static void start_quiescing(struct cache *cache) +static bool is_quiescing(struct cache *cache) { + int r; unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = 1; + r = cache->quiescing; spin_unlock_irqrestore(&cache->lock, flags); + + return r; } -static void stop_quiescing(struct cache *cache) +static void ack_quiescing(struct cache *cache) +{ + if (is_quiescing(cache)) { + atomic_inc(&cache->quiescing_ack); + wake_up(&cache->quiescing_wait); + } +} + +static void wait_for_quiescing_ack(struct cache *cache) +{ + wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); +} + +static void start_quiescing(struct cache *cache) { unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = 0; + cache->quiescing = true; spin_unlock_irqrestore(&cache->lock, flags); + + wait_for_quiescing_ack(cache); } -static bool is_quiescing(struct cache *cache) +static void stop_quiescing(struct cache *cache) { - int r; unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - r = cache->quiescing; + cache->quiescing = false; spin_unlock_irqrestore(&cache->lock, flags); - return r; + atomic_set(&cache->quiescing_ack, 0); } static void wait_for_migrations(struct cache *cache) @@ -1414,16 +1435,15 @@ static void do_worker(struct work_struct *ws) struct cache *cache = container_of(ws, struct cache, worker); do { - if (!is_quiescing(cache)) + if (!is_quiescing(cache)) { + writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); process_deferred_bios(cache); + } process_migrations(cache, &cache->quiesced_migrations, issue_copy); process_migrations(cache, &cache->completed_migrations, complete_migration); - writeback_some_dirty_blocks(cache); - - process_deferred_writethrough_bios(cache); - if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1436,6 +1456,9 @@ static void do_worker(struct work_struct *ws) process_migrations(cache, &cache->need_commit_migrations, migration_success_post_commit); } + + ack_quiescing(cache); + } while (more_work(cache)); } @@ -1998,6 +2021,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) atomic_set(&cache->nr_migrations, 0); init_waitqueue_head(&cache->migration_wait); + init_waitqueue_head(&cache->quiescing_wait); + atomic_set(&cache->quiescing_ack, 0); + r = -ENOMEM; cache->nr_dirty = 0; cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 496d5f3646a5..2f91d6d4a2cc 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -20,6 +20,7 @@ struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; + struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; @@ -45,14 +46,13 @@ struct dm_delay_info { static DEFINE_MUTEX(delayed_bios_lock); -static struct workqueue_struct *kdelayd_wq; static struct kmem_cache *delayed_cache; static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; - queue_work(kdelayd_wq, &dc->flush_expired_bios); + queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } static void queue_timeout(struct delay_c *dc, unsigned long expires) @@ -191,6 +191,12 @@ out: goto bad_dev_write; } + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; + } + setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); @@ -203,6 +209,8 @@ out: ti->private = dc; return 0; +bad_queue: + mempool_destroy(dc->delayed_pool); bad_dev_write: if (dc->dev_write) dm_put_device(ti, dc->dev_write); @@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - flush_workqueue(kdelayd_wq); + destroy_workqueue(dc->kdelayd_wq); dm_put_device(ti, dc->dev_read); @@ -350,12 +358,6 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); - if (!kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - delayed_cache = KMEM_CACHE(dm_delay_info, 0); if (!delayed_cache) { DMERR("Couldn't create delayed bio cache."); @@ -373,8 +375,6 @@ static int __init dm_delay_init(void) bad_register: kmem_cache_destroy(delayed_cache); bad_memcache: - destroy_workqueue(kdelayd_wq); -bad_queue: return r; } @@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void) { dm_unregister_target(&delay_target); kmem_cache_destroy(delayed_cache); - destroy_workqueue(kdelayd_wq); } /* Module hooks */ diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index aa04f0224642..81a79b739e97 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1644,7 +1644,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern } if (!dmi) { + unsigned noio_flag; + noio_flag = memalloc_noio_save(); dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); + memalloc_noio_restore(noio_flag); if (dmi) *param_flags |= DM_PARAMS_VMALLOC; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bdf26f5bd326..0ba21b0f3972 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -86,6 +86,7 @@ struct multipath { unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -497,7 +498,8 @@ static void process_queued_ios(struct work_struct *work) (!pgpath && !m->queue_if_no_path)) must_queue = 0; - if (m->pg_init_required && !m->pg_init_in_progress && pgpath) + if (m->pg_init_required && !m->pg_init_in_progress && pgpath && + !m->pg_init_disabled) __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); @@ -942,10 +944,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 1; + spin_unlock_irqrestore(&m->lock, flags); + flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); + + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 0; + spin_unlock_irqrestore(&m->lock, flags); } static void multipath_dtr(struct dm_target *ti) @@ -1164,7 +1176,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= m->pg_init_retries) + if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) m->pg_init_required = 1; else limit_reached = 1; @@ -1284,8 +1296,17 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) + if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) { + if ((clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors) { + struct queue_limits *limits; + + /* device doesn't really support WRITE SAME, disable it */ + limits = dm_get_queue_limits(dm_table_get_md(m->ti->table)); + limits->max_write_same_sectors = 0; + } return error; + } if (mpio->pgpath) fail_path(mpio->pgpath); @@ -1561,7 +1582,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long flags; int r; -again: bdev = NULL; mode = 0; r = 0; @@ -1579,7 +1599,7 @@ again: } if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) - r = -EAGAIN; + r = -ENOTCONN; else if (!bdev) r = -EIO; @@ -1588,14 +1608,14 @@ again: /* * Only pass ioctls through if the device sizes match exactly. */ - if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); + if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) { + int err = scsi_verify_blk_ioctl(NULL, cmd); + if (err) + r = err; + } - if (r == -EAGAIN && !fatal_signal_pending(current)) { + if (r == -ENOTCONN && !fatal_signal_pending(current)) queue_work(kmultipathd, &m->process_queued_ios); - msleep(10); - goto again; - } return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); } @@ -1694,7 +1714,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 5, 1}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1d3fe1a40a9b..2dea49c4279e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) static int validate_raid_redundancy(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; - unsigned rebuilds_per_group, copies, d; + unsigned rebuilds_per_group = 0, copies, d; unsigned group_size, last_group_start; for (i = 0; i < rs->md.raid_disks; i++) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3ac415675b6c..2d2b1b7588d7 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_work(&req.work); + flush_workqueue(ps->metadata_wq); return req.result; } @@ -269,6 +269,14 @@ static chunk_t area_location(struct pstore *ps, chunk_t area) return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); } +static void skip_metadata(struct pstore *ps) +{ + uint32_t stride = ps->exceptions_per_area + 1; + chunk_t next_free = ps->next_free; + if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS) + ps->next_free++; +} + /* * Read or write a metadata area. Remembering to skip the first * chunk which holds the header. @@ -502,6 +510,8 @@ static int read_exceptions(struct pstore *ps, ps->current_area--; + skip_metadata(ps); + return 0; } @@ -616,8 +626,6 @@ static int persistent_prepare_exception(struct dm_exception_store *store, struct dm_exception *e) { struct pstore *ps = get_info(store); - uint32_t stride; - chunk_t next_free; sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); /* Is there enough room ? */ @@ -630,10 +638,8 @@ static int persistent_prepare_exception(struct dm_exception_store *store, * Move onto the next free pending, making sure to take * into account the location of the metadata chunks. */ - stride = (ps->exceptions_per_area + 1); - next_free = ++ps->next_free; - if (sector_div(next_free, stride) == 1) - ps->next_free++; + ps->next_free++; + skip_metadata(ps); atomic_inc(&ps->pending_count); return 0; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c434e5aab2df..944690bafd93 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -66,6 +66,18 @@ struct dm_snapshot { atomic_t pending_exceptions_count; + /* Protected by "lock" */ + sector_t exception_start_sequence; + + /* Protected by kcopyd single-threaded callback */ + sector_t exception_complete_sequence; + + /* + * A list of pending exceptions that completed out of order. + * Protected by kcopyd single-threaded callback. + */ + struct list_head out_of_order_list; + mempool_t *pending_pool; struct dm_exception_table pending; @@ -173,6 +185,14 @@ struct dm_snap_pending_exception { */ int started; + /* There was copying error. */ + int copy_error; + + /* A sequence number, it is used for in-order completion. */ + sector_t exception_sequence; + + struct list_head out_of_order_entry; + /* * For writing a complete chunk, bypassing the copy. */ @@ -725,17 +745,16 @@ static int calc_max_buckets(void) */ static int init_hash_tables(struct dm_snapshot *s) { - sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + sector_t hash_size, cow_dev_size, max_buckets; /* * Calculate based on the size of the original volume or * the COW volume... */ cow_dev_size = get_dev_size(s->cow->bdev); - origin_dev_size = get_dev_size(s->origin->bdev); max_buckets = calc_max_buckets(); - hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; + hash_size = cow_dev_size >> s->store->chunk_shift; hash_size = min(hash_size, max_buckets); if (hash_size < 64) @@ -1095,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + s->exception_start_sequence = 0; + s->exception_complete_sequence = 0; + INIT_LIST_HEAD(&s->out_of_order_list); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); @@ -1444,6 +1466,19 @@ static void commit_callback(void *context, int success) pending_complete(pe, success); } +static void complete_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + if (unlikely(pe->copy_error)) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); +} + /* * Called when the copy I/O has finished. kcopyd actually runs * this code so don't block. @@ -1453,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) struct dm_snap_pending_exception *pe = context; struct dm_snapshot *s = pe->snap; - if (read_err || write_err) - pending_complete(pe, 0); + pe->copy_error = read_err || write_err; - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); + if (pe->exception_sequence == s->exception_complete_sequence) { + s->exception_complete_sequence++; + complete_exception(pe); + + while (!list_empty(&s->out_of_order_list)) { + pe = list_entry(s->out_of_order_list.next, + struct dm_snap_pending_exception, out_of_order_entry); + if (pe->exception_sequence != s->exception_complete_sequence) + break; + s->exception_complete_sequence++; + list_del(&pe->out_of_order_entry); + complete_exception(pe); + } + } else { + struct list_head *lh; + struct dm_snap_pending_exception *pe2; + + list_for_each_prev(lh, &s->out_of_order_list) { + pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); + if (pe2->exception_sequence < pe->exception_sequence) + break; + } + list_add(&pe->out_of_order_entry, lh); + } } /* @@ -1554,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s, return NULL; } + pe->exception_sequence = s->exception_start_sequence++; + dm_insert_exception(&s->pending, &pe->e); return pe; @@ -2193,7 +2249,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 11, 1}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 84d2b91e4efb..c62c5ab6aed5 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = { static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, + .release = dm_kobject_release, }; /* @@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md) */ void dm_sysfs_exit(struct mapped_device *md) { - kobject_put(dm_kobject(md)); + struct kobject *kobj = dm_kobject(md); + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1ff252ab7d46..bd88d3dade1e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -215,6 +215,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + if (!num_targets) { + kfree(t); + return -ENOMEM; + } + if (alloc_targets(t, num_targets)) { kfree(t); return -ENOMEM; @@ -580,14 +585,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) /* * Used to dynamically allocate the arg array. + * + * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must + * process messages even if some device is suspended. These messages have a + * small fixed number of arguments. + * + * On the other hand, dm-switch needs to process bulk data using messages and + * excessive use of GFP_NOIO could cause trouble. */ static char **realloc_argv(unsigned *array_size, char **old_argv) { char **argv; unsigned new_size; + gfp_t gfp; - new_size = *array_size ? *array_size * 2 : 64; - argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (*array_size) { + new_size = *array_size * 2; + gfp = GFP_KERNEL; + } else { + new_size = 8; + gfp = GFP_NOIO; + } + argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { memcpy(argv, old_argv, *array_size * sizeof(*argv)); *array_size = new_size; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 60bce435f4fa..5f49d704f275 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1349,6 +1349,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) return td->id; } +/* + * Check whether @time (of block creation) is older than @td's last snapshot. + * If so then the associated block is shared with the last snapshot device. + * Any block on a device created *after* the device last got snapshotted is + * necessarily not shared. + */ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) { return td->snapshotted_time > time; @@ -1458,6 +1464,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) return r; } +int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) +{ + int r; + uint32_t ref_count; + + down_read(&pmd->root_lock); + r = dm_sm_get_count(pmd->data_sm, b, &ref_count); + if (!r) + *result = (ref_count != 0); + up_read(&pmd->root_lock); + + return r; +} + bool dm_thin_changed_this_transaction(struct dm_thin_device *td) { int r; @@ -1469,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td) return r; } +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd) +{ + bool r = false; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (td->changed) { + r = td->changed; + break; + } + } + up_read(&pmd->root_lock); + + return r; +} + bool dm_thin_aborted_changes(struct dm_thin_device *td) { bool r; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 845ebbe589a9..8f4d62baf09b 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -161,6 +161,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); */ bool dm_thin_changed_this_transaction(struct dm_thin_device *td); +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd); + bool dm_thin_aborted_changes(struct dm_thin_device *td); int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, @@ -181,6 +183,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); +int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); + /* * Returns -ENOSPC if the new size is too small and already allocated * blocks would be lost. diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 88f2f802d528..901aac27e522 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -512,6 +512,7 @@ struct dm_thin_new_mapping { unsigned quiesced:1; unsigned prepared:1; unsigned pass_discard:1; + unsigned definitely_not_shared:1; struct thin_c *tc; dm_block_t virt_block; @@ -640,7 +641,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { - DMERR_LIMIT("dm_thin_insert_block() failed"); + DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", + dm_device_name(pool->pool_md), r); + set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, m->cell); goto out; } @@ -681,7 +684,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) cell_defer_no_holder(tc, m->cell2); if (m->pass_discard) - remap_and_issue(tc, m->bio, m->data_block); + if (m->definitely_not_shared) + remap_and_issue(tc, m->bio, m->data_block); + else { + bool used = false; + if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used) + bio_endio(m->bio, 0); + else + remap_and_issue(tc, m->bio, m->data_block); + } else bio_endio(m->bio, 0); @@ -749,13 +760,17 @@ static int ensure_next_mapping(struct pool *pool) static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) { - struct dm_thin_new_mapping *r = pool->next_mapping; + struct dm_thin_new_mapping *m = pool->next_mapping; BUG_ON(!pool->next_mapping); + memset(m, 0, sizeof(struct dm_thin_new_mapping)); + INIT_LIST_HEAD(&m->list); + m->bio = NULL; + pool->next_mapping = NULL; - return r; + return m; } static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, @@ -767,15 +782,10 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - INIT_LIST_HEAD(&m->list); - m->quiesced = 0; - m->prepared = 0; m->tc = tc; m->virt_block = virt_block; m->data_block = data_dest; m->cell = cell; - m->err = 0; - m->bio = NULL; if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) m->quiesced = 1; @@ -838,15 +848,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - INIT_LIST_HEAD(&m->list); m->quiesced = 1; m->prepared = 0; m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; m->cell = cell; - m->err = 0; - m->bio = NULL; /* * If the whole block of data is being overwritten or we are not @@ -1030,12 +1037,12 @@ static void process_discard(struct thin_c *tc, struct bio *bio) */ m = get_next_mapping(pool); m->tc = tc; - m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; + m->pass_discard = pool->pf.discard_passdown; + m->definitely_not_shared = !lookup_result.shared; m->virt_block = block; m->data_block = lookup_result.block; m->cell = cell; m->cell2 = cell2; - m->err = 0; m->bio = bio; if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { @@ -1337,7 +1344,8 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&pool->deferred_flush_bios); spin_unlock_irqrestore(&pool->lock, flags); - if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) + if (bio_list_empty(&bios) && + !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) return; if (commit_or_fallback(pool)) { @@ -2776,6 +2784,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (get_pool_mode(tc->pool) == PM_FAIL) { ti->error = "Couldn't open thin device, Pool is in fail mode"; + r = -EINVAL; goto bad_thin_open; } @@ -2787,7 +2796,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); if (r) - goto bad_thin_open; + goto bad_target_max_io_len; ti->num_flush_bios = 1; ti->flush_supported = true; @@ -2808,6 +2817,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) return 0; +bad_target_max_io_len: + dm_pool_close_thin_device(tc->td); bad_thin_open: __pool_dec(tc->pool); bad_pool_lookup: diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index b948fd864d45..0d2e812d424b 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -831,9 +831,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) for (i = v->levels - 1; i >= 0; i--) { sector_t s; v->hash_level_block[i] = hash_position; - s = verity_position_at_level(v, v->data_blocks, i); - s = (s >> v->hash_per_block_bits) + - !!(s & ((1 << v->hash_per_block_bits) - 1)); + s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1) + >> ((i + 1) * v->hash_per_block_bits); if (hash_position + s < hash_position) { ti->error = "Hash device offset overflow"; r = -E2BIG; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d5370a94b2c1..204a59fd872f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -184,8 +184,8 @@ struct mapped_device { /* forced geometry settings */ struct hd_geometry geometry; - /* sysfs handle */ - struct kobject kobj; + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; @@ -386,10 +386,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_table *map = dm_get_live_table(md); + struct dm_table *map; struct dm_target *tgt; int r = -ENOTTY; +retry: + map = dm_get_live_table(md); if (!map || !dm_table_get_size(map)) goto out; @@ -410,6 +412,11 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, out: dm_table_put(map); + if (r == -ENOTCONN) { + msleep(10); + goto retry; + } + return r; } @@ -1897,6 +1904,7 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); + init_completion(&md->kobj_holder.completion); md->disk->major = _major; md->disk->first_minor = minor; @@ -2212,6 +2220,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md) } /* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. + */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + BUG_ON(!atomic_read(&md->holders)); + return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); + +/* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). */ static int dm_init_request_based_queue(struct mapped_device *md) @@ -2717,20 +2736,14 @@ struct gendisk *dm_disk(struct mapped_device *md) struct kobject *dm_kobject(struct mapped_device *md) { - return &md->kobj; + return &md->kobj_holder.kobj; } -/* - * struct mapped_device should not be exported outside of dm.c - * so use this check to verify that kobj is part of md structure - */ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) { struct mapped_device *md; - md = container_of(kobj, struct mapped_device, kobj); - if (&md->kobj != kobj) - return NULL; + md = container_of(kobj, struct mapped_device, kobj_holder.kobj); if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 45b97da1bd06..9b3222f44835 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -15,6 +15,8 @@ #include <linux/list.h> #include <linux/blkdev.h> #include <linux/hdreg.h> +#include <linux/completion.h> +#include <linux/kobject.h> /* * Suspend feature flags @@ -125,12 +127,27 @@ void dm_interface_exit(void); /* * sysfs interface */ +struct dm_kobject_holder { + struct kobject kobj; + struct completion completion; +}; + +static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; +} + int dm_sysfs_init(struct mapped_device *md); void dm_sysfs_exit(struct mapped_device *md); struct kobject *dm_kobject(struct mapped_device *md); struct mapped_device *dm_get_from_kobject(struct kobject *kobj); /* + * The kobject helper + */ +void dm_kobject_release(struct kobject *kobj); + +/* * Targets for linear and striped mappings */ int dm_linear_init(void); diff --git a/drivers/md/md.c b/drivers/md/md.c index 9b82377a833b..a2dda416c9cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1118,6 +1118,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1196,6 +1197,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1604,6 +1607,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); if (mddev->raid_disks == 0) { @@ -1686,6 +1690,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) */ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -2829,6 +2835,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) else rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); if (err) { @@ -3619,6 +3626,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } + blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev_resume(mddev); @@ -5760,6 +5768,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; } else @@ -7693,24 +7702,11 @@ static int remove_and_add_spares(struct mddev *mddev, if (test_bit(Faulty, &rdev->flags)) continue; if (mddev->ro && - rdev->saved_raid_disk < 0) + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) continue; rdev->recovery_offset = 0; - if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); - if (mddev->in_sync) - /* OK, this device, which is in_sync, - * will definitely be noticed before - * the next write, so recovery isn't - * needed. - */ - rdev->recovery_offset = mddev->recovery_cp; - spin_unlock_irq(&mddev->write_lock); - } - if (mddev->ro && rdev->recovery_offset != MaxSector) - /* not safe to add this disk now */ - continue; if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) @@ -7788,9 +7784,12 @@ void md_check_recovery(struct mddev *mddev) * As we only add devices that are already in-sync, * we can activate the spares immediately. */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); remove_and_add_spares(mddev, NULL); - mddev->pers->spare_active(mddev); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk + */ + md_reap_sync_thread(mddev); + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -8086,6 +8085,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, u64 *p; int lo, hi; int rv = 1; + unsigned long flags; if (bb->shift < 0) /* badblocks are disabled */ @@ -8100,7 +8100,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, sectors = next - s; } - write_seqlock_irq(&bb->lock); + write_seqlock_irqsave(&bb->lock, flags); p = bb->page; lo = 0; @@ -8216,7 +8216,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, bb->changed = 1; if (!acknowledged) bb->unacked_exist = 1; - write_sequnlock_irq(&bb->lock); + write_sequnlock_irqrestore(&bb->lock, flags); return rv; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 653f992b687a..ebe748e57416 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -129,6 +129,9 @@ struct md_rdev { enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ + Bitmap_sync, /* ..actually, not quite In_sync. Need a + * bitmap-based recovery to get fully in sync + */ Unmerged, /* device is being added to array and should * be considerred for bvec_merge_fn but not * yet for actual IO diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 172147eb1d40..1d75b1dc1e2e 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, * The shadow op will often be a noop. Only insert if it really * copied data. */ - if (dm_block_location(*block) != b) + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); r = insert_ablock(info, index, *block, root); + } return r; } @@ -509,15 +517,18 @@ static int grow_add_tail_block(struct resize *resize) static int grow_needs_more_blocks(struct resize *resize) { int r; + unsigned old_nr_blocks = resize->old_nr_full_blocks; if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + r = grow_extend_tail_block(resize, resize->max_entries); if (r) return r; } r = insert_full_ablocks(resize->info, resize->size_of_block, - resize->old_nr_full_blocks, + old_nr_blocks, resize->new_nr_full_blocks, resize->max_entries, resize->value, &resize->root); diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 3e7a88d99eb0..0d240373ffab 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return -EINVAL; } + /* + * We need to set this before the dm_tm_new_block() call below. + */ + ll->nr_blocks = nr_blocks; for (i = old_blocks; i < blocks; i++) { struct dm_block *b; struct disk_index_entry idx; @@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); if (r < 0) return r; + idx.blocknr = cpu_to_le64(dm_block_location(b)); r = dm_tm_unlock(ll->tm, b); @@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return r; } - ll->nr_blocks = nr_blocks; return 0; } diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 1c959684caef..afb419e514bf 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); int r = sm_metadata_new_block_(sm, b); - if (r) + if (r) { DMERR("unable to allocate new metadata block"); + return r; + } r = sm_metadata_get_nr_free(sm, &count); - if (r) + if (r) { DMERR("couldn't get free block count"); + return r; + } check_threshold(&smm->threshold, count); @@ -604,20 +608,38 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * Flick into a mode where all blocks get allocated in the new area. */ smm->begin = old_len; - memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + memcpy(sm, &bootstrap_ops, sizeof(*sm)); /* * Extend. */ r = sm_ll_extend(&smm->ll, extra_blocks); + if (r) + goto out; /* - * Switch back to normal behaviour. + * We repeatedly increment then commit until the commit doesn't + * allocate any new blocks. */ - memcpy(&smm->sm, &ops, sizeof(smm->sm)); - for (i = old_len; !r && i < smm->begin; i++) - r = sm_ll_inc(&smm->ll, i, &ev); + do { + for (i = old_len; !r && i < smm->begin; i++) { + r = sm_ll_inc(&smm->ll, i, &ev); + if (r) + goto out; + } + old_len = smm->begin; + + r = sm_ll_commit(&smm->ll); + if (r) + goto out; + } while (old_len != smm->begin); + +out: + /* + * Switch back to normal behaviour. + */ + memcpy(sm, &ops, sizeof(*sm)); return r; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6e17f8181c4b..e73740b55aea 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1479,6 +1479,7 @@ static int raid1_spare_active(struct mddev *mddev) } } if (rdev + && rdev->recovery_offset == MaxSector && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { count++; @@ -1848,6 +1849,40 @@ static int process_checks(struct r1bio *r1_bio) int i; int vcnt; + /* Fix variable parts of all bios */ + vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); + for (i = 0; i < conf->raid_disks * 2; i++) { + int j; + int size; + int uptodate; + struct bio *b = r1_bio->bios[i]; + if (b->bi_end_io != end_sync_read) + continue; + /* fixup the bio for reuse, but preserve BIO_UPTODATE */ + uptodate = test_bit(BIO_UPTODATE, &b->bi_flags); + bio_reset(b); + if (!uptodate) + clear_bit(BIO_UPTODATE, &b->bi_flags); + b->bi_vcnt = vcnt; + b->bi_size = r1_bio->sectors << 9; + b->bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + b->bi_bdev = conf->mirrors[i].rdev->bdev; + b->bi_end_io = end_sync_read; + b->bi_private = r1_bio; + + size = b->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &b->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + } + } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { @@ -1856,17 +1891,18 @@ static int process_checks(struct r1bio *r1_bio) break; } r1_bio->read_disk = primary; - vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); for (i = 0; i < conf->raid_disks * 2; i++) { int j; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - int size; + int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags); if (sbio->bi_end_io != end_sync_read) continue; + /* Now we can 'fixup' the BIO_UPTODATE flag */ + set_bit(BIO_UPTODATE, &sbio->bi_flags); - if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + if (uptodate) { for (j = vcnt; j-- ; ) { struct page *p, *s; p = pbio->bi_io_vec[j].bv_page; @@ -1881,33 +1917,12 @@ static int process_checks(struct r1bio *r1_bio) if (j >= 0) atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { + && uptodate)) { /* No need to write to this device. */ sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); continue; } - /* fixup the bio for reuse */ - bio_reset(sbio); - sbio->bi_vcnt = vcnt; - sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - sbio->bi_end_io = end_sync_read; - sbio->bi_private = r1_bio; - - size = sbio->bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &sbio->bi_io_vec[j]; - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - } bio_copy_data(sbio, pbio); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6ddae2501b9a..d2f8cd332b4a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1321,7 +1321,7 @@ read_again: /* Could not read all from this device, so we will * need another r10_bio. */ - sectors_handled = (r10_bio->sectors + max_sectors + sectors_handled = (r10_bio->sector + max_sectors - bio->bi_sector); r10_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); @@ -1329,7 +1329,7 @@ read_again: bio->bi_phys_segments = 2; else bio->bi_phys_segments++; - spin_unlock(&conf->device_lock); + spin_unlock_irq(&conf->device_lock); /* Cannot call generic_make_request directly * as that will be queued in __generic_make_request * and subsequent mempool_alloc might block @@ -1762,6 +1762,7 @@ static int raid10_spare_active(struct mddev *mddev) } sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); } else if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; @@ -2075,11 +2076,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; */ - for (j = 0; j < vcnt; j++) + int sectors = r10_bio->sectors; + for (j = 0; j < vcnt; j++) { + int len = PAGE_SIZE; + if (sectors < (len / 512)) + len = sectors * 512; if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), page_address(tbio->bi_io_vec[j].bv_page), - fbio->bi_io_vec[j].bv_len)) + len)) break; + sectors -= len/512; + } if (j == vcnt) continue; atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); @@ -2262,12 +2269,18 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[1].devnum; wbio = r10_bio->devs[1].bio; wbio2 = r10_bio->devs[1].repl_bio; + /* Need to test wbio2->bi_end_io before we call + * generic_make_request as if the former is NULL, + * the latter is free to free wbio2. + */ + if (wbio2 && !wbio2->bi_end_io) + wbio2 = NULL; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); generic_make_request(wbio); } - if (wbio2 && wbio2->bi_end_io) { + if (wbio2) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); md_sync_acct(conf->mirrors[d].replacement->bdev, bio_sectors(wbio2)); @@ -2909,14 +2922,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, */ if (mddev->bitmap == NULL && mddev->recovery_cp == MaxSector && + mddev->reshape_position == MaxSector && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || - test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; - return max_sector - sector_nr; + return mddev->dev_sectors - sector_nr; } skipped: @@ -3186,10 +3198,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (j == conf->copies) { /* Cannot recover, so abort the recovery or * record a bad block */ - put_buf(r10_bio); - if (rb2) - atomic_dec(&rb2->remaining); - r10_bio = rb2; if (any_working) { /* problem is that there are bad blocks * on other device(s) @@ -3221,6 +3229,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, mirror->recovery_disabled = mddev->recovery_disabled; } + put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); + r10_bio = rb2; break; } } @@ -3386,6 +3398,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); + set_bit(BIO_UPTODATE, &bio->bi_flags); generic_make_request(bio); } } @@ -3532,7 +3545,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) /* FIXME calc properly */ conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + - max(0,mddev->delta_disks)), + max(0,-mddev->delta_disks)), GFP_KERNEL); if (!conf->mirrors) goto out; @@ -3691,7 +3704,7 @@ static int run(struct mddev *mddev) conf->geo.far_offset == 0) goto out_free_conf; if (conf->prev.far_copies != 1 && - conf->geo.far_offset == 0) + conf->prev.far_offset == 0) goto out_free_conf; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 05e4a105b9c7..5e3c25d4562c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -668,6 +668,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + bi->bi_vcnt = 0; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); @@ -706,6 +712,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + rbi->bi_vcnt = 0; if (conf->mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), @@ -1881,6 +1893,7 @@ static void raid5_end_write_request(struct bio *bi, int error) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (!uptodate) { + set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2800,6 +2813,14 @@ static void handle_stripe_clean_event(struct r5conf *conf, } /* now that discard is done we can proceed with any sync */ clear_bit(STRIPE_DISCARD, &sh->state); + /* + * SCSI discard will change some bio fields and the stripe has + * no updated data, so remove it from hash list and the stripe + * will be reinitialized + */ + spin_lock_irq(&conf->device_lock); + remove_hash(sh); + spin_unlock_irq(&conf->device_lock); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) set_bit(STRIPE_HANDLE, &sh->state); @@ -3371,7 +3392,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) */ set_bit(R5_Insync, &dev->flags); - if (rdev && test_bit(R5_WriteError, &dev->flags)) { + if (test_bit(R5_WriteError, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ struct md_rdev *rdev2 = rcu_dereference( @@ -3384,7 +3405,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } else clear_bit(R5_WriteError, &dev->flags); } - if (rdev && test_bit(R5_MadeGood, &dev->flags)) { + if (test_bit(R5_MadeGood, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ struct md_rdev *rdev2 = rcu_dereference( @@ -3462,6 +3483,7 @@ static void handle_stripe(struct stripe_head *sh) test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); + clear_bit(STRIPE_REPLACED, &sh->state); } spin_unlock(&sh->stripe_lock); } @@ -3607,19 +3629,23 @@ static void handle_stripe(struct stripe_head *sh) handle_parity_checks5(conf, sh, &s, disks); } - if (s.replacing && s.locked == 0 - && !test_bit(STRIPE_INSYNC, &sh->state)) { + if ((s.replacing || s.syncing) && s.locked == 0 + && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) + && !test_bit(STRIPE_REPLACED, &sh->state)) { /* Write out to replacement devices where possible */ for (i = 0; i < conf->raid_disks; i++) - if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && - test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); set_bit(R5_WantReplace, &sh->dev[i].flags); set_bit(R5_LOCKED, &sh->dev[i].flags); s.locked++; } - set_bit(STRIPE_INSYNC, &sh->state); + if (s.replacing) + set_bit(STRIPE_INSYNC, &sh->state); + set_bit(STRIPE_REPLACED, &sh->state); } if ((s.syncing || s.replacing) && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); @@ -5011,23 +5037,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } +static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + percpu->spare_page = NULL; + percpu->scribble = NULL; +} + +static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + if (conf->level == 6 && !percpu->spare_page) + percpu->spare_page = alloc_page(GFP_KERNEL); + if (!percpu->scribble) + percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + + if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { + free_scratch_buffer(conf, percpu); + return -ENOMEM; + } + + return 0; +} + static void raid5_free_percpu(struct r5conf *conf) { - struct raid5_percpu *percpu; unsigned long cpu; if (!conf->percpu) return; - get_online_cpus(); - for_each_possible_cpu(cpu) { - percpu = per_cpu_ptr(conf->percpu, cpu); - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - } #ifdef CONFIG_HOTPLUG_CPU unregister_cpu_notifier(&conf->cpu_notify); #endif + + get_online_cpus(); + for_each_possible_cpu(cpu) + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); put_online_cpus(); free_percpu(conf->percpu); @@ -5053,15 +5099,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (conf->level == 6 && !percpu->spare_page) - percpu->spare_page = alloc_page(GFP_KERNEL); - if (!percpu->scribble) - percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - - if (!percpu->scribble || - (conf->level == 6 && !percpu->spare_page)) { - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); + if (alloc_scratch_buffer(conf, percpu)) { pr_err("%s: failed memory allocation for cpu%ld\n", __func__, cpu); return notifier_from_errno(-ENOMEM); @@ -5069,10 +5107,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, break; case CPU_DEAD: case CPU_DEAD_FROZEN: - safe_put_page(percpu->spare_page); - kfree(percpu->scribble); - percpu->spare_page = NULL; - percpu->scribble = NULL; + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); break; default: break; @@ -5084,40 +5119,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, static int raid5_alloc_percpu(struct r5conf *conf) { unsigned long cpu; - struct page *spare_page; - struct raid5_percpu __percpu *allcpus; - void *scribble; - int err; + int err = 0; - allcpus = alloc_percpu(struct raid5_percpu); - if (!allcpus) + conf->percpu = alloc_percpu(struct raid5_percpu); + if (!conf->percpu) return -ENOMEM; - conf->percpu = allcpus; + +#ifdef CONFIG_HOTPLUG_CPU + conf->cpu_notify.notifier_call = raid456_cpu_notify; + conf->cpu_notify.priority = 0; + err = register_cpu_notifier(&conf->cpu_notify); + if (err) + return err; +#endif get_online_cpus(); - err = 0; for_each_present_cpu(cpu) { - if (conf->level == 6) { - spare_page = alloc_page(GFP_KERNEL); - if (!spare_page) { - err = -ENOMEM; - break; - } - per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; - } - scribble = kmalloc(conf->scribble_len, GFP_KERNEL); - if (!scribble) { - err = -ENOMEM; + err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + if (err) { + pr_err("%s: failed memory allocation for cpu%ld\n", + __func__, cpu); break; } - per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; } -#ifdef CONFIG_HOTPLUG_CPU - conf->cpu_notify.notifier_call = raid456_cpu_notify; - conf->cpu_notify.priority = 0; - if (err == 0) - err = register_cpu_notifier(&conf->cpu_notify); -#endif put_online_cpus(); return err; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b0b663b119a8..70c49329ca9a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -306,6 +306,7 @@ enum { STRIPE_SYNC_REQUESTED, STRIPE_SYNCING, STRIPE_INSYNC, + STRIPE_REPLACED, STRIPE_PREREAD_ACTIVE, STRIPE_DELAYED, STRIPE_DEGRADED, |