aboutsummaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig4
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/bcache/bcache.h8
-rw-r--r--drivers/md/bcache/bset.c63
-rw-r--r--drivers/md/bcache/btree.c29
-rw-r--r--drivers/md/bcache/closure.c6
-rw-r--r--drivers/md/bcache/io.c2
-rw-r--r--drivers/md/bcache/journal.c42
-rw-r--r--drivers/md/bcache/request.c23
-rw-r--r--drivers/md/bcache/super.c31
-rw-r--r--drivers/md/bcache/sysfs.c8
-rw-r--r--drivers/md/bcache/util.c11
-rw-r--r--drivers/md/bcache/util.h12
-rw-r--r--drivers/md/bcache/writeback.c42
-rw-r--r--drivers/md/dm-bufio.c6
-rw-r--r--drivers/md/dm-builtin.c48
-rw-r--r--drivers/md/dm-cache-metadata.c12
-rw-r--r--drivers/md/dm-cache-target.c67
-rw-r--r--drivers/md/dm-crypt.c81
-rw-r--r--drivers/md/dm-delay.c23
-rw-r--r--drivers/md/dm-io.c22
-rw-r--r--drivers/md/dm-ioctl.c3
-rw-r--r--drivers/md/dm-log-userspace-transfer.c2
-rw-r--r--drivers/md/dm-mpath.c44
-rw-r--r--drivers/md/dm-raid.c13
-rw-r--r--drivers/md/dm-snap-persistent.c20
-rw-r--r--drivers/md/dm-snap.c76
-rw-r--r--drivers/md/dm-sysfs.c5
-rw-r--r--drivers/md/dm-table.c23
-rw-r--r--drivers/md/dm-thin-metadata.c46
-rw-r--r--drivers/md/dm-thin-metadata.h4
-rw-r--r--drivers/md/dm-thin.c48
-rw-r--r--drivers/md/dm-verity.c5
-rw-r--r--drivers/md/dm.c35
-rw-r--r--drivers/md/dm.h17
-rw-r--r--drivers/md/md.c59
-rw-r--r--drivers/md/md.h3
-rw-r--r--drivers/md/persistent-data/dm-array.c15
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c15
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h3
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h6
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c2
-rw-r--r--drivers/md/persistent-data/dm-btree.c24
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c6
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c36
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c5
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h17
-rw-r--r--drivers/md/raid1.c94
-rw-r--r--drivers/md/raid10.c56
-rw-r--r--drivers/md/raid5.c150
-rw-r--r--drivers/md/raid5.h1
51 files changed, 946 insertions, 428 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3bfc8f1da9fe..29cff90096ad 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -176,8 +176,12 @@ config MD_FAULTY
source "drivers/md/bcache/Kconfig"
+config BLK_DEV_DM_BUILTIN
+ boolean
+
config BLK_DEV_DM
tristate "Device mapper support"
+ select BLK_DEV_DM_BUILTIN
---help---
Device-mapper is a low level volume manager. It works by allowing
people to specify mappings for ranges of logical sectors. Various
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1439fd4ad9b1..3591a7292381 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_BCACHE) += bcache/
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
+obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d3e15b42a4ab..6bc016eabdc9 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -437,6 +437,7 @@ struct bcache_device {
/* If nonzero, we're detaching/unregistering from cache set */
atomic_t detaching;
+ int flush_done;
atomic_long_t sectors_dirty;
unsigned long sectors_dirty_gc;
@@ -498,7 +499,7 @@ struct cached_dev {
*/
atomic_t has_dirty;
- struct ratelimit writeback_rate;
+ struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
/*
@@ -507,10 +508,9 @@ struct cached_dev {
*/
sector_t last_read;
- /* Number of writeback bios in flight */
- atomic_t in_flight;
+ /* Limit number of writeback bios in flight */
+ struct semaphore in_flight;
struct closure_with_timer writeback;
- struct closure_waitlist writeback_wait;
struct keybuf writeback_keys;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index cb4578a327b9..1b27cbd822e1 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -918,29 +918,59 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
/* Mergesort */
-static void btree_sort_fixup(struct btree_iter *iter)
+static void sort_key_next(struct btree_iter *iter,
+ struct btree_iter_set *i)
+{
+ i->k = bkey_next(i->k);
+
+ if (i->k == i->end)
+ *i = iter->data[--iter->used];
+}
+
+static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp)
{
while (iter->used > 1) {
struct btree_iter_set *top = iter->data, *i = top + 1;
- struct bkey *k;
if (iter->used > 2 &&
btree_iter_cmp(i[0], i[1]))
i++;
- for (k = i->k;
- k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
- k = bkey_next(k))
- if (top->k > i->k)
- __bch_cut_front(top->k, k);
- else if (KEY_SIZE(k))
- bch_cut_back(&START_KEY(k), top->k);
-
- if (top->k < i->k || k == i->k)
+ if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
break;
- heap_sift(iter, i - top, btree_iter_cmp);
+ if (!KEY_SIZE(i->k)) {
+ sort_key_next(iter, i);
+ heap_sift(iter, i - top, btree_iter_cmp);
+ continue;
+ }
+
+ if (top->k > i->k) {
+ if (bkey_cmp(top->k, i->k) >= 0)
+ sort_key_next(iter, i);
+ else
+ bch_cut_front(top->k, i->k);
+
+ heap_sift(iter, i - top, btree_iter_cmp);
+ } else {
+ /* can't happen because of comparison func */
+ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+
+ if (bkey_cmp(i->k, top->k) < 0) {
+ bkey_copy(tmp, top->k);
+
+ bch_cut_back(&START_KEY(i->k), tmp);
+ bch_cut_front(i->k, top->k);
+ heap_sift(iter, 0, btree_iter_cmp);
+
+ return tmp;
+ } else {
+ bch_cut_back(&START_KEY(i->k), top->k);
+ }
+ }
}
+
+ return NULL;
}
static void btree_mergesort(struct btree *b, struct bset *out,
@@ -948,15 +978,20 @@ static void btree_mergesort(struct btree *b, struct bset *out,
bool fixup, bool remove_stale)
{
struct bkey *k, *last = NULL;
+ BKEY_PADDED(k) tmp;
bool (*bad)(struct btree *, const struct bkey *) = remove_stale
? bch_ptr_bad
: bch_ptr_invalid;
while (!btree_iter_end(iter)) {
if (fixup && !b->level)
- btree_sort_fixup(iter);
+ k = btree_sort_fixup(iter, &tmp.k);
+ else
+ k = NULL;
+
+ if (!k)
+ k = bch_btree_iter_next(iter);
- k = bch_btree_iter_next(iter);
if (bad(b, k))
continue;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 7a5658f04e62..7d3deab11fce 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -326,10 +326,25 @@ static void do_btree_write(struct btree *b)
i->csum = btree_csum_set(b, i);
btree_bio_init(b);
- b->bio->bi_rw = REQ_META|WRITE_SYNC;
+ b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
bch_bio_map(b->bio, i);
+ /*
+ * If we're appending to a leaf node, we don't technically need FUA -
+ * this write just needs to be persisted before the next journal write,
+ * which will be marked FLUSH|FUA.
+ *
+ * Similarly if we're writing a new btree root - the pointer is going to
+ * be in the next journal entry.
+ *
+ * But if we're writing a new btree node (that isn't a root) or
+ * appending to a non leaf btree node, we need either FUA or a flush
+ * when we write the parent with the new pointer. FUA is cheaper than a
+ * flush, and writes appending to leaf nodes aren't blocking anything so
+ * just make all btree node writes FUA to keep things sane.
+ */
+
bkey_copy(&k.key, &b->key);
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
@@ -618,7 +633,7 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
return mca_can_free(c) * c->btree_pages;
/* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_WAIT)
+ if (sc->gfp_mask & __GFP_IO)
mutex_lock(&c->bucket_lock);
else if (!mutex_trylock(&c->bucket_lock))
return -1;
@@ -1419,8 +1434,10 @@ static void btree_gc_start(struct cache_set *c)
for_each_cache(ca, c, i)
for_each_bucket(b, ca) {
b->gc_gen = b->gen;
- if (!atomic_read(&b->pin))
+ if (!atomic_read(&b->pin)) {
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+ SET_GC_SECTORS_USED(b, 0);
+ }
}
for (d = c->devices;
@@ -2140,6 +2157,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
void bch_btree_set_root(struct btree *b)
{
unsigned i;
+ struct closure cl;
+
+ closure_init_stack(&cl);
BUG_ON(!b->written);
@@ -2153,8 +2173,9 @@ void bch_btree_set_root(struct btree *b)
b->c->root = b;
__bkey_put(b->c, &b->key);
- bch_journal_meta(b->c, NULL);
+ bch_journal_meta(b->c, &cl);
pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
+ closure_sync(&cl);
}
/* Cache lookup */
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index bd05a9a8c7cf..9aba2017f0d1 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
} else {
struct closure *parent = cl->parent;
struct closure_waitlist *wait = closure_waitlist(cl);
+ closure_fn *destructor = cl->fn;
closure_debug_destroy(cl);
+ smp_mb();
atomic_set(&cl->remaining, -1);
if (wait)
closure_wake_up(wait);
- if (cl->fn)
- cl->fn(cl);
+ if (destructor)
+ destructor(cl);
if (parent)
closure_put(parent);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 48efd4dea645..d285cd49104c 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -97,6 +97,8 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
if (bio->bi_rw & REQ_DISCARD) {
ret = bio_alloc_bioset(gfp, 1, bs);
+ if (!ret)
+ return NULL;
idx = 0;
goto out;
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8c8dfdcd9d4c..151a4ab90dd4 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -151,7 +151,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
pr_debug("%u journal buckets", ca->sb.njournal_buckets);
- /* Read journal buckets ordered by golden ratio hash to quickly
+ /*
+ * Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries
*/
for (i = 0; i < ca->sb.njournal_buckets; i++) {
@@ -164,36 +165,45 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
goto bsearch;
}
- /* If that fails, check all the buckets we haven't checked
+ /*
+ * If that fails, check all the buckets we haven't checked
* already
*/
pr_debug("falling back to linear search");
- for (l = 0; l < ca->sb.njournal_buckets; l++) {
- if (test_bit(l, bitmap))
- continue;
-
+ for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
+ l < ca->sb.njournal_buckets;
+ l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
if (read_bucket(l))
goto bsearch;
- }
+
+ if (list_empty(list))
+ continue;
bsearch:
/* Binary search */
m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
+ seq = list_entry(list->prev, struct journal_replay,
+ list)->j.seq;
+
m = (l + r) >> 1;
+ read_bucket(m);
- if (read_bucket(m))
+ if (seq != list_entry(list->prev, struct journal_replay,
+ list)->j.seq)
l = m;
else
r = m;
}
- /* Read buckets in reverse order until we stop finding more
+ /*
+ * Read buckets in reverse order until we stop finding more
* journal entries
*/
- pr_debug("finishing up");
+ pr_debug("finishing up: m %u njournal_buckets %u",
+ m, ca->sb.njournal_buckets);
l = m;
while (1) {
@@ -221,9 +231,10 @@ bsearch:
}
}
- c->journal.seq = list_entry(list->prev,
- struct journal_replay,
- list)->j.seq;
+ if (!list_empty(list))
+ c->journal.seq = list_entry(list->prev,
+ struct journal_replay,
+ list)->j.seq;
return 0;
#undef read_bucket
@@ -420,7 +431,7 @@ static void do_journal_discard(struct cache *ca)
return;
}
- switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
+ switch (atomic_read(&ja->discard_in_flight)) {
case DISCARD_IN_FLIGHT:
return;
@@ -617,7 +628,7 @@ static void journal_write_unlocked(struct closure *cl)
bio_reset(bio);
bio->bi_sector = PTR_OFFSET(k, i);
bio->bi_bdev = ca->bdev;
- bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
+ bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
bio->bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
@@ -681,6 +692,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
if (cl)
BUG_ON(!closure_wait(&w->wait, cl));
+ closure_flush(&c->journal.io);
__journal_try_write(c, true);
}
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e5ff12e52d5b..a30a0f8a41c0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -489,6 +489,12 @@ static void bch_insert_data_loop(struct closure *cl)
bch_queue_gc(op->c);
}
+ /*
+ * Journal writes are marked REQ_FLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
+
do {
unsigned i;
struct bkey *k;
@@ -716,7 +722,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
s->task = current;
s->orig_bio = bio;
s->write = (bio->bi_rw & REQ_WRITE) != 0;
- s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
+ s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
s->recoverable = 1;
s->start_time = jiffies;
@@ -1047,9 +1053,22 @@ static void request_write(struct cached_dev *dc, struct search *s)
trace_bcache_writethrough(s->orig_bio);
closure_bio_submit(bio, cl, s->d);
} else {
- s->op.cache_bio = bio;
trace_bcache_writeback(s->orig_bio);
bch_writeback_add(dc, bio_sectors(bio));
+ s->op.cache_bio = bio;
+
+ if (bio->bi_rw & REQ_FLUSH) {
+ /* Also need to send a flush to the backing device */
+ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
+ dc->disk.bio_split);
+
+ flush->bi_rw = WRITE_FLUSH;
+ flush->bi_bdev = bio->bi_bdev;
+ flush->bi_end_io = request_endio;
+ flush->bi_private = cl;
+
+ closure_bio_submit(flush, cl, s->d);
+ }
}
out:
closure_call(&s->op.cl, bch_insert_data, NULL, cl);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f88e2b653a3f..b4713cea1913 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -704,7 +704,8 @@ static void bcache_device_detach(struct bcache_device *d)
atomic_set(&d->detaching, 0);
}
- bcache_device_unlink(d);
+ if (!d->flush_done)
+ bcache_device_unlink(d);
d->c->devices[d->id] = NULL;
closure_put(&d->c->caching);
@@ -781,6 +782,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+ blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
+
return 0;
}
@@ -1014,6 +1017,14 @@ static void cached_dev_flush(struct closure *cl)
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
+ mutex_lock(&bch_register_lock);
+ d->flush_done = 1;
+
+ if (d->c)
+ bcache_device_unlink(d);
+
+ mutex_unlock(&bch_register_lock);
+
bch_cache_accounting_destroy(&dc->accounting);
kobject_del(&d->kobj);
@@ -1303,18 +1314,22 @@ static void cache_set_flush(struct closure *cl)
static void __cache_set_unregister(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
- struct cached_dev *dc, *t;
+ struct cached_dev *dc;
size_t i;
mutex_lock(&bch_register_lock);
- if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
- list_for_each_entry_safe(dc, t, &c->cached_devs, list)
- bch_cached_dev_detach(dc);
-
for (i = 0; i < c->nr_uuids; i++)
- if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
- bcache_device_stop(c->devices[i]);
+ if (c->devices[i]) {
+ if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+ dc = container_of(c->devices[i],
+ struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
+ } else {
+ bcache_device_stop(c->devices[i]);
+ }
+ }
mutex_unlock(&bch_register_lock);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4d9cca47e4c6..e9bd6c0cca5b 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -214,7 +214,13 @@ STORE(__cached_dev)
}
if (attr == &sysfs_label) {
- memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
+ if (size > SB_LABEL_SIZE)
+ return -EINVAL;
+ memcpy(dc->sb.label, buf, size);
+ if (size < SB_LABEL_SIZE)
+ dc->sb.label[size] = '\0';
+ if (size && dc->sb.label[size - 1] == '\n')
+ dc->sb.label[size - 1] = '\0';
bch_write_bdev_super(dc, NULL);
if (dc->disk.c) {
memcpy(dc->disk.c->uuids[dc->disk.id].label,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index da3a99e85b1e..38a43f8d36ec 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
stats->last = now ?: 1;
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d - the struct bch_ratelimit to update
+ * @done - the amount of work done, in arbitrary units
+ *
+ * Returns the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 577393e38c3a..43fd78affcea 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequency_units)
(ewma) >> factor; \
})
-struct ratelimit {
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
uint64_t next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond
+ * The units here correspond to the units passed to bch_next_delay()
+ */
unsigned rate;
};
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
{
d->next = local_clock();
}
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
#define __DIV_SAFE(n, d, zero) \
({ \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 2714ed3991d1..841f0490d4ef 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -91,11 +91,15 @@ static void update_writeback_rate(struct work_struct *work)
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
+ uint64_t ret;
+
if (atomic_read(&dc->disk.detaching) ||
!dc->writeback_percent)
return 0;
- return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+ ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+
+ return min_t(uint64_t, ret, HZ);
}
/* Background writeback */
@@ -165,7 +169,7 @@ static void refill_dirty(struct closure *cl)
up_write(&dc->writeback_lock);
- ratelimit_reset(&dc->writeback_rate);
+ bch_ratelimit_reset(&dc->writeback_rate);
/* Punt to workqueue only so we don't recurse and blow the stack */
continue_at(cl, read_dirty, dirty_wq);
@@ -246,9 +250,7 @@ static void write_dirty_finish(struct closure *cl)
}
bch_keybuf_del(&dc->writeback_keys, w);
- atomic_dec_bug(&dc->in_flight);
-
- closure_wake_up(&dc->writeback_wait);
+ up(&dc->in_flight);
closure_return_with_destructor(cl, dirty_io_destructor);
}
@@ -278,7 +280,7 @@ static void write_dirty(struct closure *cl)
trace_bcache_write_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty_finish, dirty_wq);
+ continue_at(cl, write_dirty_finish, system_wq);
}
static void read_dirty_endio(struct bio *bio, int error)
@@ -299,7 +301,7 @@ static void read_dirty_submit(struct closure *cl)
trace_bcache_read_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
- continue_at(cl, write_dirty, dirty_wq);
+ continue_at(cl, write_dirty, system_wq);
}
static void read_dirty(struct closure *cl)
@@ -324,12 +326,8 @@ static void read_dirty(struct closure *cl)
if (delay > 0 &&
(KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)) {
- w->private = NULL;
-
- closure_delay(&dc->writeback, delay);
- continue_at(cl, read_dirty, dirty_wq);
- }
+ jiffies_to_msecs(delay) > 50))
+ delay = schedule_timeout_uninterruptible(delay);
dc->last_read = KEY_OFFSET(&w->key);
@@ -354,15 +352,10 @@ static void read_dirty(struct closure *cl)
pr_debug("%s", pkey(&w->key));
- closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+ down(&dc->in_flight);
+ closure_call(&io->cl, read_dirty_submit, NULL, cl);
delay = writeback_delay(dc, KEY_SIZE(&w->key));
-
- atomic_inc(&dc->in_flight);
-
- if (!closure_wait_event(&dc->writeback_wait, cl,
- atomic_read(&dc->in_flight) < 64))
- continue_at(cl, read_dirty, dirty_wq);
}
if (0) {
@@ -372,11 +365,16 @@ err:
bch_keybuf_del(&dc->writeback_keys, w);
}
- refill_dirty(cl);
+ /*
+ * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+ * freed) before refilling again
+ */
+ continue_at(cl, refill_dirty, dirty_wq);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
+ sema_init(&dc->in_flight, 64);
closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock);
@@ -406,7 +404,7 @@ void bch_writeback_exit(void)
int __init bch_writeback_init(void)
{
- dirty_wq = create_singlethread_workqueue("bcache_writeback");
+ dirty_wq = create_workqueue("bcache_writeback");
if (!dirty_wq)
return -ENOMEM;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0387e05cdb98..c9b4ca9e0696 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -462,6 +462,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
c->n_buffers[dirty]++;
b->list_mode = dirty;
list_move(&b->lru_list, &c->lru[dirty]);
+ b->last_accessed = jiffies;
}
/*----------------------------------------------------------------
@@ -1660,6 +1661,11 @@ static int __init dm_bufio_init(void)
{
__u64 mem;
+ dm_bufio_allocated_kmem_cache = 0;
+ dm_bufio_allocated_get_free_pages = 0;
+ dm_bufio_allocated_vmalloc = 0;
+ dm_bufio_current_allocated = 0;
+
memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
new file mode 100644
index 000000000000..6c9049c51b2b
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,48 @@
+#include "dm.h"
+
+/*
+ * The kobject release method must not be placed in the module itself,
+ * otherwise we are subject to module unload races.
+ *
+ * The release method is called when the last reference to the kobject is
+ * dropped. It may be called by any other kernel code that drops the last
+ * reference.
+ *
+ * The release method suffers from module unload race. We may prevent the
+ * module from being unloaded at the start of the release method (using
+ * increased module reference count or synchronizing against the release
+ * method), however there is no way to prevent the module from being
+ * unloaded at the end of the release method.
+ *
+ * If this code were placed in the dm module, the following race may
+ * happen:
+ * 1. Some other process takes a reference to dm kobject
+ * 2. The user issues ioctl function to unload the dm device
+ * 3. dm_sysfs_exit calls kobject_put, however the object is not released
+ * because of the other reference taken at step 1
+ * 4. dm_sysfs_exit waits on the completion
+ * 5. The other process that took the reference in step 1 drops it,
+ * dm_kobject_release is called from this process
+ * 6. dm_kobject_release calls complete()
+ * 7. a reschedule happens before dm_kobject_release returns
+ * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference
+ * count is decremented
+ * 9. The user unloads the dm module
+ * 10. The other process that was rescheduled in step 7 continues to run,
+ * it is now executing code in unloaded module, so it crashes
+ *
+ * Note that if the process that takes the foreign reference to dm kobject
+ * has a low priority and the system is sufficiently loaded with
+ * higher-priority processes that prevent the low-priority process from
+ * being scheduled long enough, this bug may really happen.
+ *
+ * In order to fix this module unload race, we place the release method
+ * into a helper code that is compiled directly into the kernel.
+ */
+
+void dm_kobject_release(struct kobject *kobj)
+{
+ complete(dm_get_completion_from_kobject(kobj));
+}
+
+EXPORT_SYMBOL(dm_kobject_release);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255bbffb..de737ba1d351 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -384,6 +384,15 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
disk_super = dm_block_data(sblock);
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)cmd->data_block_size);
+ r = -EINVAL;
+ goto bad;
+ }
+
r = __check_incompat_features(disk_super, cmd);
if (r < 0)
goto bad;
@@ -511,8 +520,9 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
disk_super = dm_block_data(sblock);
update_flags(disk_super, mutator);
read_superblock_fields(cmd, disk_super);
+ dm_bm_unlock(sblock);
- return dm_bm_flush_and_unlock(cmd->bm, sblock);
+ return dm_bm_flush(cmd->bm);
}
static int __begin_transaction(struct dm_cache_metadata *cmd)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index df44b60e66f2..677973641d2b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -151,6 +151,9 @@ struct cache {
atomic_t nr_migrations;
wait_queue_head_t migration_wait;
+ wait_queue_head_t quiescing_wait;
+ atomic_t quiescing_ack;
+
/*
* cache_size entries, dirty if set
*/
@@ -742,8 +745,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
static void cleanup_migration(struct dm_cache_migration *mg)
{
- dec_nr_migrations(mg->cache);
+ struct cache *cache = mg->cache;
free_migration(mg);
+ dec_nr_migrations(cache);
}
static void migration_failure(struct dm_cache_migration *mg)
@@ -857,12 +861,13 @@ static void issue_copy_real(struct dm_cache_migration *mg)
int r;
struct dm_io_region o_region, c_region;
struct cache *cache = mg->cache;
+ sector_t cblock = from_cblock(mg->cblock);
o_region.bdev = cache->origin_dev->bdev;
o_region.count = cache->sectors_per_block;
c_region.bdev = cache->cache_dev->bdev;
- c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
+ c_region.sector = cblock * cache->sectors_per_block;
c_region.count = cache->sectors_per_block;
if (mg->writeback || mg->demote) {
@@ -1340,34 +1345,51 @@ static void writeback_some_dirty_blocks(struct cache *cache)
/*----------------------------------------------------------------
* Main worker loop
*--------------------------------------------------------------*/
-static void start_quiescing(struct cache *cache)
+static bool is_quiescing(struct cache *cache)
{
+ int r;
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
- cache->quiescing = 1;
+ r = cache->quiescing;
spin_unlock_irqrestore(&cache->lock, flags);
+
+ return r;
}
-static void stop_quiescing(struct cache *cache)
+static void ack_quiescing(struct cache *cache)
+{
+ if (is_quiescing(cache)) {
+ atomic_inc(&cache->quiescing_ack);
+ wake_up(&cache->quiescing_wait);
+ }
+}
+
+static void wait_for_quiescing_ack(struct cache *cache)
+{
+ wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
+}
+
+static void start_quiescing(struct cache *cache)
{
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
- cache->quiescing = 0;
+ cache->quiescing = true;
spin_unlock_irqrestore(&cache->lock, flags);
+
+ wait_for_quiescing_ack(cache);
}
-static bool is_quiescing(struct cache *cache)
+static void stop_quiescing(struct cache *cache)
{
- int r;
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
- r = cache->quiescing;
+ cache->quiescing = false;
spin_unlock_irqrestore(&cache->lock, flags);
- return r;
+ atomic_set(&cache->quiescing_ack, 0);
}
static void wait_for_migrations(struct cache *cache)
@@ -1414,16 +1436,15 @@ static void do_worker(struct work_struct *ws)
struct cache *cache = container_of(ws, struct cache, worker);
do {
- if (!is_quiescing(cache))
+ if (!is_quiescing(cache)) {
+ writeback_some_dirty_blocks(cache);
+ process_deferred_writethrough_bios(cache);
process_deferred_bios(cache);
+ }
process_migrations(cache, &cache->quiesced_migrations, issue_copy);
process_migrations(cache, &cache->completed_migrations, complete_migration);
- writeback_some_dirty_blocks(cache);
-
- process_deferred_writethrough_bios(cache);
-
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
@@ -1436,6 +1457,9 @@ static void do_worker(struct work_struct *ws)
process_migrations(cache, &cache->need_commit_migrations,
migration_success_post_commit);
}
+
+ ack_quiescing(cache);
+
} while (more_work(cache));
}
@@ -1930,6 +1954,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
ti->discard_zeroes_data_unsupported = true;
+ /* Discard bios must be split on a block boundary */
+ ti->split_discard_bios = true;
cache->features = ca->features;
ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -1998,6 +2024,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->nr_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
+ init_waitqueue_head(&cache->quiescing_wait);
+ atomic_set(&cache->quiescing_ack, 0);
+
r = -ENOMEM;
cache->nr_dirty = 0;
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2148,20 +2177,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
bool discarded_block;
struct dm_bio_prison_cell *cell;
struct policy_result lookup_result;
- struct per_bio_data *pb;
+ struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
- if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
+ if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
/*
* This can only occur if the io goes to a partial block at
* the end of the origin device. We don't cache these.
* Just remap to the origin and carry on.
*/
- remap_to_origin_clear_discard(cache, bio, block);
+ remap_to_origin(cache, bio);
return DM_MAPIO_REMAPPED;
}
- pb = init_per_bio_data(bio, pb_data_size);
-
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41ae9e32..7409d79729ee 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,7 +18,6 @@
#include <linux/crypto.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
-#include <linux/percpu.h>
#include <linux/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
@@ -44,6 +43,7 @@ struct convert_context {
unsigned int idx_out;
sector_t cc_sector;
atomic_t cc_pending;
+ struct ablkcipher_request *req;
};
/*
@@ -105,15 +105,7 @@ struct iv_lmk_private {
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
/*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
- struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
*/
struct crypt_config {
struct dm_dev *dev;
@@ -143,12 +135,6 @@ struct crypt_config {
sector_t iv_offset;
unsigned int iv_size;
- /*
- * Duplicated per cpu state. Access through
- * per_cpu_ptr() only.
- */
- struct crypt_cpu __percpu *cpu;
-
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
struct crypto_ablkcipher **tfms;
@@ -184,11 +170,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
- return this_cpu_ptr(cc->cpu);
-}
-
/*
* Use this to access cipher attributes that are the same for each CPU.
*/
@@ -738,16 +719,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!this_cc->req)
- this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ if (!ctx->req)
+ ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
- ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
- ablkcipher_request_set_callback(this_cc->req,
+ ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ ablkcipher_request_set_callback(ctx->req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+ kcryptd_async_done, dmreq_of_req(cc, ctx->req));
}
/*
@@ -756,7 +736,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
int r;
atomic_set(&ctx->cc_pending, 1);
@@ -768,7 +747,7 @@ static int crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
- r = crypt_convert_block(cc, ctx, this_cc->req);
+ r = crypt_convert_block(cc, ctx, ctx->req);
switch (r) {
/* async */
@@ -777,7 +756,7 @@ static int crypt_convert(struct crypt_config *cc,
INIT_COMPLETION(ctx->restart);
/* fall through*/
case -EINPROGRESS:
- this_cc->req = NULL;
+ ctx->req = NULL;
ctx->cc_sector++;
continue;
@@ -876,6 +855,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
io->sector = sector;
io->error = 0;
io->base_io = NULL;
+ io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
return io;
@@ -901,6 +881,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (!atomic_dec_and_test(&io->io_pending))
return;
+ if (io->ctx.req)
+ mempool_free(io->ctx.req, cc->req_pool);
mempool_free(io, cc->io_pool);
if (likely(!base_io))
@@ -1326,8 +1308,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = ti->private;
- struct crypt_cpu *cpu_cc;
- int cpu;
ti->private = NULL;
@@ -1339,13 +1319,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
- if (cc->cpu)
- for_each_possible_cpu(cpu) {
- cpu_cc = per_cpu_ptr(cc->cpu, cpu);
- if (cpu_cc->req)
- mempool_free(cpu_cc->req, cc->req_pool);
- }
-
crypt_free_tfms(cc);
if (cc->bs)
@@ -1364,9 +1337,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- if (cc->cpu)
- free_percpu(cc->cpu);
-
kzfree(cc->cipher);
kzfree(cc->cipher_string);
@@ -1421,13 +1391,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
- cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
- __alignof__(struct crypt_cpu));
- if (!cc->cpu) {
- ti->error = "Cannot allocate per cpu state";
- goto bad_mem;
- }
-
/*
* For compatibility with the original dm-crypt mapping format, if
* only the cipher name is supplied, use cbc-plain.
@@ -1543,6 +1506,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned int key_size, opt_params;
unsigned long long tmpll;
int ret;
+ size_t iv_size_padding;
struct dm_arg_set as;
const char *opt_string;
char dummy;
@@ -1579,12 +1543,23 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
- cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
- ~(crypto_tfm_ctx_alignment() - 1);
+ cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
+
+ if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ /* Allocate the padding exactly */
+ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
+ & crypto_ablkcipher_alignmask(any_tfm(cc));
+ } else {
+ /*
+ * If the cipher requires greater alignment than kmalloc
+ * alignment, we don't know the exact position of the
+ * initialization vector. We must assume worst case.
+ */
+ iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
+ }
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
- sizeof(struct dm_crypt_request) + cc->iv_size);
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 496d5f3646a5..2f91d6d4a2cc 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -20,6 +20,7 @@
struct delay_c {
struct timer_list delay_timer;
struct mutex timer_lock;
+ struct workqueue_struct *kdelayd_wq;
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
static DEFINE_MUTEX(delayed_bios_lock);
-static struct workqueue_struct *kdelayd_wq;
static struct kmem_cache *delayed_cache;
static void handle_delayed_timer(unsigned long data)
{
struct delay_c *dc = (struct delay_c *)data;
- queue_work(kdelayd_wq, &dc->flush_expired_bios);
+ queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}
static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ out:
goto bad_dev_write;
}
+ dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+ if (!dc->kdelayd_wq) {
+ DMERR("Couldn't start kdelayd");
+ goto bad_queue;
+ }
+
setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ out:
ti->private = dc;
return 0;
+bad_queue:
+ mempool_destroy(dc->delayed_pool);
bad_dev_write:
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- flush_workqueue(kdelayd_wq);
+ destroy_workqueue(dc->kdelayd_wq);
dm_put_device(ti, dc->dev_read);
@@ -350,12 +358,6 @@ static int __init dm_delay_init(void)
{
int r = -ENOMEM;
- kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
- if (!kdelayd_wq) {
- DMERR("Couldn't start kdelayd");
- goto bad_queue;
- }
-
delayed_cache = KMEM_CACHE(dm_delay_info, 0);
if (!delayed_cache) {
DMERR("Couldn't create delayed bio cache.");
@@ -373,8 +375,6 @@ static int __init dm_delay_init(void)
bad_register:
kmem_cache_destroy(delayed_cache);
bad_memcache:
- destroy_workqueue(kdelayd_wq);
-bad_queue:
return r;
}
@@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void)
{
dm_unregister_target(&delay_target);
kmem_cache_destroy(delayed_cache);
- destroy_workqueue(kdelayd_wq);
}
/* Module hooks */
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834377c8..d1de1626a9d2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -10,6 +10,7 @@
#include <linux/device-mapper.h>
#include <linux/bio.h>
+#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -34,7 +35,7 @@ struct dm_io_client {
struct io {
unsigned long error_bits;
atomic_t count;
- struct task_struct *sleeper;
+ struct completion *wait;
struct dm_io_client *client;
io_notify_fn callback;
void *context;
@@ -122,8 +123,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
- if (io->sleeper)
- wake_up_process(io->sleeper);
+ if (io->wait)
+ complete(io->wait);
else {
unsigned long r = io->error_bits;
@@ -386,6 +387,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
+ DECLARE_COMPLETION_ONSTACK(wait);
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
@@ -394,7 +396,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = current;
+ io->wait = &wait;
io->client = client;
io->vma_invalidate_address = dp->vma_invalidate_address;
@@ -402,15 +404,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
dispatch_io(rw, num_regions, where, dp, io, 1);
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!atomic_read(&io->count))
- break;
-
- io_schedule();
- }
- set_current_state(TASK_RUNNING);
+ wait_for_completion_io(&wait);
if (error_bits)
*error_bits = io->error_bits;
@@ -433,7 +427,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = NULL;
+ io->wait = NULL;
io->client = client;
io->callback = fn;
io->context = context;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index aa04f0224642..81a79b739e97 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1644,7 +1644,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
}
if (!dmi) {
+ unsigned noio_flag;
+ noio_flag = memalloc_noio_save();
dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL);
+ memalloc_noio_restore(noio_flag);
if (dmi)
*param_flags |= DM_PARAMS_VMALLOC;
}
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259a..c69d0b787746 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void)
r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
if (r) {
- cn_del_callback(&ulog_cn_id);
+ kfree(prealloced_cn_msg);
return r;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index bdf26f5bd326..0ba21b0f3972 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -86,6 +86,7 @@ struct multipath {
unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+ unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
@@ -497,7 +498,8 @@ static void process_queued_ios(struct work_struct *work)
(!pgpath && !m->queue_if_no_path))
must_queue = 0;
- if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
+ if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
+ !m->pg_init_disabled)
__pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
@@ -942,10 +944,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
static void flush_multipath_work(struct multipath *m)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 1;
+ spin_unlock_irqrestore(&m->lock, flags);
+
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
flush_work(&m->trigger_event);
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 0;
+ spin_unlock_irqrestore(&m->lock, flags);
}
static void multipath_dtr(struct dm_target *ti)
@@ -1164,7 +1176,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
spin_lock_irqsave(&m->lock, flags);
- if (m->pg_init_count <= m->pg_init_retries)
+ if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
m->pg_init_required = 1;
else
limit_reached = 1;
@@ -1284,8 +1296,17 @@ static int do_end_io(struct multipath *m, struct request *clone,
if (!error && !clone->errors)
return 0; /* I/O complete */
- if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ)
+ if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) {
+ if ((clone->cmd_flags & REQ_WRITE_SAME) &&
+ !clone->q->limits.max_write_same_sectors) {
+ struct queue_limits *limits;
+
+ /* device doesn't really support WRITE SAME, disable it */
+ limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
+ limits->max_write_same_sectors = 0;
+ }
return error;
+ }
if (mpio->pgpath)
fail_path(mpio->pgpath);
@@ -1561,7 +1582,6 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long flags;
int r;
-again:
bdev = NULL;
mode = 0;
r = 0;
@@ -1579,7 +1599,7 @@ again:
}
if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
- r = -EAGAIN;
+ r = -ENOTCONN;
else if (!bdev)
r = -EIO;
@@ -1588,14 +1608,14 @@ again:
/*
* Only pass ioctls through if the device sizes match exactly.
*/
- if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
- r = scsi_verify_blk_ioctl(NULL, cmd);
+ if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
+ int err = scsi_verify_blk_ioctl(NULL, cmd);
+ if (err)
+ r = err;
+ }
- if (r == -EAGAIN && !fatal_signal_pending(current)) {
+ if (r == -ENOTCONN && !fatal_signal_pending(current))
queue_work(kmultipathd, &m->process_queued_ios);
- msleep(10);
- goto again;
- }
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
@@ -1694,7 +1714,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 5, 1},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1d3fe1a40a9b..84cddccc0249 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
- unsigned rebuilds_per_group, copies, d;
+ unsigned rebuilds_per_group = 0, copies, d;
unsigned group_size, last_group_start;
for (i = 0; i < rs->md.raid_disks; i++)
@@ -785,8 +785,7 @@ struct dm_raid_superblock {
__le32 layout;
__le32 stripe_sectors;
- __u8 pad[452]; /* Round struct to 512 bytes. */
- /* Always set to 0 when writing. */
+ /* Remainder of a logical block is zero-filled when writing (see super_sync()). */
} __packed;
static int read_disk_sb(struct md_rdev *rdev, int size)
@@ -823,7 +822,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
test_bit(Faulty, &(rs->dev[i].rdev.flags)))
failed_devices |= (1ULL << i);
- memset(sb, 0, sizeof(*sb));
+ memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
sb->magic = cpu_to_le32(DM_RAID_MAGIC);
sb->features = cpu_to_le32(0); /* No features yet */
@@ -858,7 +857,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
uint64_t events_sb, events_refsb;
rdev->sb_start = 0;
- rdev->sb_size = sizeof(*sb);
+ rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
+ if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
+ DMERR("superblock size of a logical block is no longer valid");
+ return -EINVAL;
+ }
ret = read_disk_sb(rdev, rdev->sb_size);
if (ret)
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3ac415675b6c..2d2b1b7588d7 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
*/
INIT_WORK_ONSTACK(&req.work, do_metadata);
queue_work(ps->metadata_wq, &req.work);
- flush_work(&req.work);
+ flush_workqueue(ps->metadata_wq);
return req.result;
}
@@ -269,6 +269,14 @@ static chunk_t area_location(struct pstore *ps, chunk_t area)
return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area);
}
+static void skip_metadata(struct pstore *ps)
+{
+ uint32_t stride = ps->exceptions_per_area + 1;
+ chunk_t next_free = ps->next_free;
+ if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS)
+ ps->next_free++;
+}
+
/*
* Read or write a metadata area. Remembering to skip the first
* chunk which holds the header.
@@ -502,6 +510,8 @@ static int read_exceptions(struct pstore *ps,
ps->current_area--;
+ skip_metadata(ps);
+
return 0;
}
@@ -616,8 +626,6 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
struct dm_exception *e)
{
struct pstore *ps = get_info(store);
- uint32_t stride;
- chunk_t next_free;
sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
/* Is there enough room ? */
@@ -630,10 +638,8 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
* Move onto the next free pending, making sure to take
* into account the location of the metadata chunks.
*/
- stride = (ps->exceptions_per_area + 1);
- next_free = ++ps->next_free;
- if (sector_div(next_free, stride) == 1)
- ps->next_free++;
+ ps->next_free++;
+ skip_metadata(ps);
atomic_inc(&ps->pending_count);
return 0;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5aab2df..944690bafd93 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -66,6 +66,18 @@ struct dm_snapshot {
atomic_t pending_exceptions_count;
+ /* Protected by "lock" */
+ sector_t exception_start_sequence;
+
+ /* Protected by kcopyd single-threaded callback */
+ sector_t exception_complete_sequence;
+
+ /*
+ * A list of pending exceptions that completed out of order.
+ * Protected by kcopyd single-threaded callback.
+ */
+ struct list_head out_of_order_list;
+
mempool_t *pending_pool;
struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
*/
int started;
+ /* There was copying error. */
+ int copy_error;
+
+ /* A sequence number, it is used for in-order completion. */
+ sector_t exception_sequence;
+
+ struct list_head out_of_order_entry;
+
/*
* For writing a complete chunk, bypassing the copy.
*/
@@ -725,17 +745,16 @@ static int calc_max_buckets(void)
*/
static int init_hash_tables(struct dm_snapshot *s)
{
- sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+ sector_t hash_size, cow_dev_size, max_buckets;
/*
* Calculate based on the size of the original volume or
* the COW volume...
*/
cow_dev_size = get_dev_size(s->cow->bdev);
- origin_dev_size = get_dev_size(s->origin->bdev);
max_buckets = calc_max_buckets();
- hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
+ hash_size = cow_dev_size >> s->store->chunk_shift;
hash_size = min(hash_size, max_buckets);
if (hash_size < 64)
@@ -1095,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->valid = 1;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
+ s->exception_start_sequence = 0;
+ s->exception_complete_sequence = 0;
+ INIT_LIST_HEAD(&s->out_of_order_list);
init_rwsem(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
@@ -1444,6 +1466,19 @@ static void commit_callback(void *context, int success)
pending_complete(pe, success);
}
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+ struct dm_snapshot *s = pe->snap;
+
+ if (unlikely(pe->copy_error))
+ pending_complete(pe, 0);
+
+ else
+ /* Update the metadata if we are persistent */
+ s->store->type->commit_exception(s->store, &pe->e,
+ commit_callback, pe);
+}
+
/*
* Called when the copy I/O has finished. kcopyd actually runs
* this code so don't block.
@@ -1453,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
struct dm_snap_pending_exception *pe = context;
struct dm_snapshot *s = pe->snap;
- if (read_err || write_err)
- pending_complete(pe, 0);
+ pe->copy_error = read_err || write_err;
- else
- /* Update the metadata if we are persistent */
- s->store->type->commit_exception(s->store, &pe->e,
- commit_callback, pe);
+ if (pe->exception_sequence == s->exception_complete_sequence) {
+ s->exception_complete_sequence++;
+ complete_exception(pe);
+
+ while (!list_empty(&s->out_of_order_list)) {
+ pe = list_entry(s->out_of_order_list.next,
+ struct dm_snap_pending_exception, out_of_order_entry);
+ if (pe->exception_sequence != s->exception_complete_sequence)
+ break;
+ s->exception_complete_sequence++;
+ list_del(&pe->out_of_order_entry);
+ complete_exception(pe);
+ }
+ } else {
+ struct list_head *lh;
+ struct dm_snap_pending_exception *pe2;
+
+ list_for_each_prev(lh, &s->out_of_order_list) {
+ pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+ if (pe2->exception_sequence < pe->exception_sequence)
+ break;
+ }
+ list_add(&pe->out_of_order_entry, lh);
+ }
}
/*
@@ -1554,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s,
return NULL;
}
+ pe->exception_sequence = s->exception_start_sequence++;
+
dm_insert_exception(&s->pending, &pe->e);
return pe;
@@ -2193,7 +2249,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 11, 1},
+ .version = {1, 12, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 84d2b91e4efb..c62c5ab6aed5 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = {
static struct kobj_type dm_ktype = {
.sysfs_ops = &dm_sysfs_ops,
.default_attrs = dm_attrs,
+ .release = dm_kobject_release,
};
/*
@@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md)
*/
void dm_sysfs_exit(struct mapped_device *md)
{
- kobject_put(dm_kobject(md));
+ struct kobject *kobj = dm_kobject(md);
+ kobject_put(kobj);
+ wait_for_completion(dm_get_completion_from_kobject(kobj));
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1ff252ab7d46..bd88d3dade1e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -215,6 +215,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
+ if (!num_targets) {
+ kfree(t);
+ return -ENOMEM;
+ }
+
if (alloc_targets(t, num_targets)) {
kfree(t);
return -ENOMEM;
@@ -580,14 +585,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
/*
* Used to dynamically allocate the arg array.
+ *
+ * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
+ * process messages even if some device is suspended. These messages have a
+ * small fixed number of arguments.
+ *
+ * On the other hand, dm-switch needs to process bulk data using messages and
+ * excessive use of GFP_NOIO could cause trouble.
*/
static char **realloc_argv(unsigned *array_size, char **old_argv)
{
char **argv;
unsigned new_size;
+ gfp_t gfp;
- new_size = *array_size ? *array_size * 2 : 64;
- argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
+ if (*array_size) {
+ new_size = *array_size * 2;
+ gfp = GFP_KERNEL;
+ } else {
+ new_size = 8;
+ gfp = GFP_NOIO;
+ }
+ argv = kmalloc(new_size * sizeof(*argv), gfp);
if (argv) {
memcpy(argv, old_argv, *array_size * sizeof(*argv));
*array_size = new_size;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 60bce435f4fa..3b1503dc1f13 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -591,6 +591,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
disk_super = dm_block_data(sblock);
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)pmd->data_block_size);
+ r = -EINVAL;
+ goto bad_unlock_sblock;
+ }
+
r = __check_incompat_features(disk_super, pmd);
if (r < 0)
goto bad_unlock_sblock;
@@ -1349,6 +1358,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
return td->id;
}
+/*
+ * Check whether @time (of block creation) is older than @td's last snapshot.
+ * If so then the associated block is shared with the last snapshot device.
+ * Any block on a device created *after* the device last got snapshotted is
+ * necessarily not shared.
+ */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
return td->snapshotted_time > time;
@@ -1458,6 +1473,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
return r;
}
+int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
+{
+ int r;
+ uint32_t ref_count;
+
+ down_read(&pmd->root_lock);
+ r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
+ if (!r)
+ *result = (ref_count != 0);
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
int r;
@@ -1469,6 +1498,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
return r;
}
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
+{
+ bool r = false;
+ struct dm_thin_device *td, *tmp;
+
+ down_read(&pmd->root_lock);
+ list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+ if (td->changed) {
+ r = td->changed;
+ break;
+ }
+ }
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
bool r;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 845ebbe589a9..8f4d62baf09b 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -161,6 +161,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
*/
bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
+
bool dm_thin_aborted_changes(struct dm_thin_device *td);
int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -181,6 +183,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
+int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
+
/*
* Returns -ENOSPC if the new size is too small and already allocated
* blocks would be lost.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 88f2f802d528..86a2a5e3b26b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -512,6 +512,7 @@ struct dm_thin_new_mapping {
unsigned quiesced:1;
unsigned prepared:1;
unsigned pass_discard:1;
+ unsigned definitely_not_shared:1;
struct thin_c *tc;
dm_block_t virt_block;
@@ -640,7 +641,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
if (r) {
- DMERR_LIMIT("dm_thin_insert_block() failed");
+ DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
+ dm_device_name(pool->pool_md), r);
+ set_pool_mode(pool, PM_READ_ONLY);
cell_error(pool, m->cell);
goto out;
}
@@ -681,7 +684,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
cell_defer_no_holder(tc, m->cell2);
if (m->pass_discard)
- remap_and_issue(tc, m->bio, m->data_block);
+ if (m->definitely_not_shared)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else {
+ bool used = false;
+ if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
+ bio_endio(m->bio, 0);
+ else
+ remap_and_issue(tc, m->bio, m->data_block);
+ }
else
bio_endio(m->bio, 0);
@@ -749,13 +760,17 @@ static int ensure_next_mapping(struct pool *pool)
static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
- struct dm_thin_new_mapping *r = pool->next_mapping;
+ struct dm_thin_new_mapping *m = pool->next_mapping;
BUG_ON(!pool->next_mapping);
+ memset(m, 0, sizeof(struct dm_thin_new_mapping));
+ INIT_LIST_HEAD(&m->list);
+ m->bio = NULL;
+
pool->next_mapping = NULL;
- return r;
+ return m;
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -767,15 +782,10 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
- INIT_LIST_HEAD(&m->list);
- m->quiesced = 0;
- m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
m->data_block = data_dest;
m->cell = cell;
- m->err = 0;
- m->bio = NULL;
if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
m->quiesced = 1;
@@ -838,15 +848,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
- INIT_LIST_HEAD(&m->list);
m->quiesced = 1;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
m->data_block = data_block;
m->cell = cell;
- m->err = 0;
- m->bio = NULL;
/*
* If the whole block of data is being overwritten or we are not
@@ -1030,12 +1037,12 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
*/
m = get_next_mapping(pool);
m->tc = tc;
- m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
+ m->pass_discard = pool->pf.discard_passdown;
+ m->definitely_not_shared = !lookup_result.shared;
m->virt_block = block;
m->data_block = lookup_result.block;
m->cell = cell;
m->cell2 = cell2;
- m->err = 0;
m->bio = bio;
if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
@@ -1315,9 +1322,9 @@ static void process_deferred_bios(struct pool *pool)
*/
if (ensure_next_mapping(pool)) {
spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_bios, bio);
bio_list_merge(&pool->deferred_bios, &bios);
spin_unlock_irqrestore(&pool->lock, flags);
-
break;
}
@@ -1337,7 +1344,8 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&pool->deferred_flush_bios);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
+ if (bio_list_empty(&bios) &&
+ !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
return;
if (commit_or_fallback(pool)) {
@@ -2639,7 +2647,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
*/
if (pt->adjusted_pf.discard_passdown) {
data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = data_limits->discard_granularity;
+ limits->discard_granularity = max(data_limits->discard_granularity,
+ pool->sectors_per_block << SECTOR_SHIFT);
} else
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
}
@@ -2776,6 +2785,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (get_pool_mode(tc->pool) == PM_FAIL) {
ti->error = "Couldn't open thin device, Pool is in fail mode";
+ r = -EINVAL;
goto bad_thin_open;
}
@@ -2787,7 +2797,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
if (r)
- goto bad_thin_open;
+ goto bad_target_max_io_len;
ti->num_flush_bios = 1;
ti->flush_supported = true;
@@ -2808,6 +2818,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
return 0;
+bad_target_max_io_len:
+ dm_pool_close_thin_device(tc->td);
bad_thin_open:
__pool_dec(tc->pool);
bad_pool_lookup:
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index b948fd864d45..0d2e812d424b 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -831,9 +831,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
for (i = v->levels - 1; i >= 0; i--) {
sector_t s;
v->hash_level_block[i] = hash_position;
- s = verity_position_at_level(v, v->data_blocks, i);
- s = (s >> v->hash_per_block_bits) +
- !!(s & ((1 << v->hash_per_block_bits) - 1));
+ s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1)
+ >> ((i + 1) * v->hash_per_block_bits);
if (hash_position + s < hash_position) {
ti->error = "Hash device offset overflow";
r = -E2BIG;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d5370a94b2c1..204a59fd872f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -184,8 +184,8 @@ struct mapped_device {
/* forced geometry settings */
struct hd_geometry geometry;
- /* sysfs handle */
- struct kobject kobj;
+ /* kobject and completion */
+ struct dm_kobject_holder kobj_holder;
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
@@ -386,10 +386,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
- struct dm_table *map = dm_get_live_table(md);
+ struct dm_table *map;
struct dm_target *tgt;
int r = -ENOTTY;
+retry:
+ map = dm_get_live_table(md);
if (!map || !dm_table_get_size(map))
goto out;
@@ -410,6 +412,11 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
out:
dm_table_put(map);
+ if (r == -ENOTCONN) {
+ msleep(10);
+ goto retry;
+ }
+
return r;
}
@@ -1897,6 +1904,7 @@ static struct mapped_device *alloc_dev(int minor)
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
+ init_completion(&md->kobj_holder.completion);
md->disk->major = _major;
md->disk->first_minor = minor;
@@ -2212,6 +2220,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
}
/*
+ * The queue_limits are only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
+{
+ BUG_ON(!atomic_read(&md->holders));
+ return &md->queue->limits;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue_limits);
+
+/*
* Fully initialize a request-based queue (->elevator, ->request_fn, etc).
*/
static int dm_init_request_based_queue(struct mapped_device *md)
@@ -2717,20 +2736,14 @@ struct gendisk *dm_disk(struct mapped_device *md)
struct kobject *dm_kobject(struct mapped_device *md)
{
- return &md->kobj;
+ return &md->kobj_holder.kobj;
}
-/*
- * struct mapped_device should not be exported outside of dm.c
- * so use this check to verify that kobj is part of md structure
- */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
struct mapped_device *md;
- md = container_of(kobj, struct mapped_device, kobj);
- if (&md->kobj != kobj)
- return NULL;
+ md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
if (test_bit(DMF_FREEING, &md->flags) ||
dm_deleting_md(md))
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 45b97da1bd06..9b3222f44835 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -15,6 +15,8 @@
#include <linux/list.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
+#include <linux/completion.h>
+#include <linux/kobject.h>
/*
* Suspend feature flags
@@ -125,12 +127,27 @@ void dm_interface_exit(void);
/*
* sysfs interface
*/
+struct dm_kobject_holder {
+ struct kobject kobj;
+ struct completion completion;
+};
+
+static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
+{
+ return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
+}
+
int dm_sysfs_init(struct mapped_device *md);
void dm_sysfs_exit(struct mapped_device *md);
struct kobject *dm_kobject(struct mapped_device *md);
struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
/*
+ * The kobject helper
+ */
+void dm_kobject_release(struct kobject *kobj);
+
+/*
* Targets for linear and striped mappings
*/
int dm_linear_init(void);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9b82377a833b..aaf77b07bb72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1118,6 +1118,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = -1;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
if (mddev->raid_disks == 0) {
@@ -1196,6 +1197,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
*/
if (ev1 < mddev->bitmap->events_cleared)
return 0;
+ if (ev1 < mddev->events)
+ set_bit(Bitmap_sync, &rdev->flags);
} else {
if (ev1 < mddev->events)
/* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1604,6 +1607,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = -1;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
if (mddev->raid_disks == 0) {
@@ -1686,6 +1690,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
*/
if (ev1 < mddev->bitmap->events_cleared)
return 0;
+ if (ev1 < mddev->events)
+ set_bit(Bitmap_sync, &rdev->flags);
} else {
if (ev1 < mddev->events)
/* just a hot-add of a new device, leave raid_disk at -1 */
@@ -2829,6 +2835,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
else
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
err = rdev->mddev->pers->
hot_add_disk(rdev->mddev, rdev);
if (err) {
@@ -3619,6 +3626,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer);
}
+ blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev_resume(mddev);
@@ -5760,6 +5768,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
set_bit(In_sync, &rdev->flags);
+ clear_bit(Bitmap_sync, &rdev->flags);
} else
rdev->raid_disk = -1;
} else
@@ -7329,8 +7338,10 @@ void md_do_sync(struct md_thread *thread)
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return;
- if (mddev->ro) /* never try to sync a read-only array */
+ if (mddev->ro) {/* never try to sync a read-only array */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
+ }
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
@@ -7436,6 +7447,19 @@ void md_do_sync(struct md_thread *thread)
rdev->recovery_offset < j)
j = rdev->recovery_offset;
rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
}
printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -7693,24 +7717,11 @@ static int remove_and_add_spares(struct mddev *mddev,
if (test_bit(Faulty, &rdev->flags))
continue;
if (mddev->ro &&
- rdev->saved_raid_disk < 0)
+ ! (rdev->saved_raid_disk >= 0 &&
+ !test_bit(Bitmap_sync, &rdev->flags)))
continue;
rdev->recovery_offset = 0;
- if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
- spin_lock_irq(&mddev->write_lock);
- if (mddev->in_sync)
- /* OK, this device, which is in_sync,
- * will definitely be noticed before
- * the next write, so recovery isn't
- * needed.
- */
- rdev->recovery_offset = mddev->recovery_cp;
- spin_unlock_irq(&mddev->write_lock);
- }
- if (mddev->ro && rdev->recovery_offset != MaxSector)
- /* not safe to add this disk now */
- continue;
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
@@ -7788,9 +7799,13 @@ void md_check_recovery(struct mddev *mddev)
* As we only add devices that are already in-sync,
* we can activate the spares immediately.
*/
- clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
remove_and_add_spares(mddev, NULL);
- mddev->pers->spare_active(mddev);
+ /* There is no thread, but we need to call
+ * ->spare_active and clear saved_raid_disk
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
@@ -8086,6 +8101,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
u64 *p;
int lo, hi;
int rv = 1;
+ unsigned long flags;
if (bb->shift < 0)
/* badblocks are disabled */
@@ -8100,7 +8116,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
sectors = next - s;
}
- write_seqlock_irq(&bb->lock);
+ write_seqlock_irqsave(&bb->lock, flags);
p = bb->page;
lo = 0;
@@ -8216,7 +8232,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
bb->changed = 1;
if (!acknowledged)
bb->unacked_exist = 1;
- write_sequnlock_irq(&bb->lock);
+ write_sequnlock_irqrestore(&bb->lock, flags);
return rv;
}
@@ -8481,7 +8497,8 @@ static int md_notify_reboot(struct notifier_block *this,
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
- mddev->safemode = 2;
+ if (mddev->persistent)
+ mddev->safemode = 2;
mddev_unlock(mddev);
}
need_delay = 1;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 653f992b687a..ebe748e57416 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -129,6 +129,9 @@ struct md_rdev {
enum flag_bits {
Faulty, /* device is known to have a fault */
In_sync, /* device is in_sync with rest of array */
+ Bitmap_sync, /* ..actually, not quite In_sync. Need a
+ * bitmap-based recovery to get fully in sync
+ */
Unmerged, /* device is being added to array and should
* be considerred for bvec_merge_fn but not
* yet for actual IO
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 172147eb1d40..1d75b1dc1e2e 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
* The shadow op will often be a noop. Only insert if it really
* copied data.
*/
- if (dm_block_location(*block) != b)
+ if (dm_block_location(*block) != b) {
+ /*
+ * dm_tm_shadow_block will have already decremented the old
+ * block, but it is still referenced by the btree. We
+ * increment to stop the insert decrementing it below zero
+ * when overwriting the old value.
+ */
+ dm_tm_inc(info->btree_info.tm, b);
r = insert_ablock(info, index, *block, root);
+ }
return r;
}
@@ -509,15 +517,18 @@ static int grow_add_tail_block(struct resize *resize)
static int grow_needs_more_blocks(struct resize *resize)
{
int r;
+ unsigned old_nr_blocks = resize->old_nr_full_blocks;
if (resize->old_nr_entries_in_last_block > 0) {
+ old_nr_blocks++;
+
r = grow_extend_tail_block(resize, resize->max_entries);
if (r)
return r;
}
r = insert_full_ablocks(resize->info, resize->size_of_block,
- resize->old_nr_full_blocks,
+ old_nr_blocks,
resize->new_nr_full_blocks,
resize->max_entries, resize->value,
&resize->root);
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 81b513890e2b..6372d0bea532 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -595,25 +595,14 @@ int dm_bm_unlock(struct dm_block *b)
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
- struct dm_block *superblock)
+int dm_bm_flush(struct dm_block_manager *bm)
{
- int r;
-
if (bm->read_only)
return -EPERM;
- r = dm_bufio_write_dirty_buffers(bm->bufio);
- if (unlikely(r)) {
- dm_bm_unlock(superblock);
- return r;
- }
-
- dm_bm_unlock(superblock);
-
return dm_bufio_write_dirty_buffers(bm->bufio);
}
-EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
+EXPORT_SYMBOL_GPL(dm_bm_flush);
void dm_bm_set_read_only(struct dm_block_manager *bm)
{
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index be5bff61be28..f74c0462e5e4 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -105,8 +105,7 @@ int dm_bm_unlock(struct dm_block *b);
*
* This method always blocks.
*/
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
- struct dm_block *superblock);
+int dm_bm_flush(struct dm_block_manager *bm);
/*
* Switches the bm to a read only mode. Once read-only mode
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index 37d367bb9aa8..bf2b80d5c470 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -42,6 +42,12 @@ struct btree_node {
} __packed;
+/*
+ * Locks a block using the btree node validator.
+ */
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+ struct dm_block **result);
+
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
struct dm_btree_value_type *vt);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
index cf9fd676ae44..1b5e13ec7f96 100644
--- a/drivers/md/persistent-data/dm-btree-spine.c
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -92,7 +92,7 @@ struct dm_block_validator btree_node_validator = {
/*----------------------------------------------------------------*/
-static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
+int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
struct dm_block **result)
{
return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 35865425e4b4..0a7592e88811 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -812,22 +812,26 @@ EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
* FIXME: We shouldn't use a recursive algorithm when we have limited stack
* space. Also this only works for single level trees.
*/
-static int walk_node(struct ro_spine *s, dm_block_t block,
+static int walk_node(struct dm_btree_info *info, dm_block_t block,
int (*fn)(void *context, uint64_t *keys, void *leaf),
void *context)
{
int r;
unsigned i, nr;
+ struct dm_block *node;
struct btree_node *n;
uint64_t keys;
- r = ro_step(s, block);
- n = ro_node(s);
+ r = bn_read_lock(info, block, &node);
+ if (r)
+ return r;
+
+ n = dm_block_data(node);
nr = le32_to_cpu(n->header.nr_entries);
for (i = 0; i < nr; i++) {
if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
- r = walk_node(s, value64(n, i), fn, context);
+ r = walk_node(info, value64(n, i), fn, context);
if (r)
goto out;
} else {
@@ -839,7 +843,7 @@ static int walk_node(struct ro_spine *s, dm_block_t block,
}
out:
- ro_pop(s);
+ dm_tm_unlock(info->tm, node);
return r;
}
@@ -847,15 +851,7 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
int (*fn)(void *context, uint64_t *keys, void *leaf),
void *context)
{
- int r;
- struct ro_spine spine;
-
BUG_ON(info->levels > 1);
-
- init_ro_spine(&spine, info);
- r = walk_node(&spine, root, fn, context);
- exit_ro_spine(&spine);
-
- return r;
+ return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 3e7a88d99eb0..0d240373ffab 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
return -EINVAL;
}
+ /*
+ * We need to set this before the dm_tm_new_block() call below.
+ */
+ ll->nr_blocks = nr_blocks;
for (i = old_blocks; i < blocks; i++) {
struct dm_block *b;
struct disk_index_entry idx;
@@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
if (r < 0)
return r;
+
idx.blocknr = cpu_to_le64(dm_block_location(b));
r = dm_tm_unlock(ll->tm, b);
@@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
return r;
}
- ll->nr_blocks = nr_blocks;
return 0;
}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 1c959684caef..afb419e514bf 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
int r = sm_metadata_new_block_(sm, b);
- if (r)
+ if (r) {
DMERR("unable to allocate new metadata block");
+ return r;
+ }
r = sm_metadata_get_nr_free(sm, &count);
- if (r)
+ if (r) {
DMERR("couldn't get free block count");
+ return r;
+ }
check_threshold(&smm->threshold, count);
@@ -604,20 +608,38 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
* Flick into a mode where all blocks get allocated in the new area.
*/
smm->begin = old_len;
- memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
+ memcpy(sm, &bootstrap_ops, sizeof(*sm));
/*
* Extend.
*/
r = sm_ll_extend(&smm->ll, extra_blocks);
+ if (r)
+ goto out;
/*
- * Switch back to normal behaviour.
+ * We repeatedly increment then commit until the commit doesn't
+ * allocate any new blocks.
*/
- memcpy(&smm->sm, &ops, sizeof(smm->sm));
- for (i = old_len; !r && i < smm->begin; i++)
- r = sm_ll_inc(&smm->ll, i, &ev);
+ do {
+ for (i = old_len; !r && i < smm->begin; i++) {
+ r = sm_ll_inc(&smm->ll, i, &ev);
+ if (r)
+ goto out;
+ }
+ old_len = smm->begin;
+
+ r = sm_ll_commit(&smm->ll);
+ if (r)
+ goto out;
+ } while (old_len != smm->begin);
+
+out:
+ /*
+ * Switch back to normal behaviour.
+ */
+ memcpy(sm, &ops, sizeof(*sm));
return r;
}
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 81da1a26042e..3bc30a0ae3d6 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -154,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm)
if (r < 0)
return r;
- return 0;
+ return dm_bm_flush(tm->bm);
}
EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
@@ -164,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
return -EWOULDBLOCK;
wipe_shadow_table(tm);
+ dm_bm_unlock(root);
- return dm_bm_flush_and_unlock(tm->bm, root);
+ return dm_bm_flush(tm->bm);
}
EXPORT_SYMBOL_GPL(dm_tm_commit);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index b5b139076ca5..2772ed2a781a 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac
/*
* We use a 2-phase commit here.
*
- * i) In the first phase the block manager is told to start flushing, and
- * the changes to the space map are written to disk. You should interrogate
- * your particular space map to get detail of its root node etc. to be
- * included in your superblock.
+ * i) Make all changes for the transaction *except* for the superblock.
+ * Then call dm_tm_pre_commit() to flush them to disk.
*
- * ii) @root will be committed last. You shouldn't use more than the
- * first 512 bytes of @root if you wish the transaction to survive a power
- * failure. You *must* have a write lock held on @root for both stage (i)
- * and (ii). The commit will drop the write lock.
+ * ii) Lock your superblock. Update. Then call dm_tm_commit() which will
+ * unlock the superblock and flush it. No other blocks should be updated
+ * during this period. Care should be taken to never unlock a partially
+ * updated superblock; perform any operations that could fail *before* you
+ * take the superblock lock.
*/
int dm_tm_pre_commit(struct dm_transaction_manager *tm);
-int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
/*
* These methods are the only way to get hold of a writeable block.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6e17f8181c4b..e885dbf08c40 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -94,6 +94,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
struct pool_info *pi = data;
struct r1bio *r1_bio;
struct bio *bio;
+ int need_pages;
int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
@@ -116,15 +117,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
* RESYNC_PAGES for each bio.
*/
if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
- j = pi->raid_disks;
+ need_pages = pi->raid_disks;
else
- j = 1;
- while(j--) {
+ need_pages = 1;
+ for (j = 0; j < need_pages; j++) {
bio = r1_bio->bios[j];
bio->bi_vcnt = RESYNC_PAGES;
if (bio_alloc_pages(bio, gfp_flags))
- goto out_free_bio;
+ goto out_free_pages;
}
/* If not user-requests, copy the page pointers to all bios */
if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -138,6 +139,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
+out_free_pages:
+ while (--j >= 0) {
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, r1_bio->bios[j], i)
+ __free_page(bv->bv_page);
+ }
+
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
@@ -1397,12 +1406,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
@@ -1479,6 +1488,7 @@ static int raid1_spare_active(struct mddev *mddev)
}
}
if (rdev
+ && rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
@@ -1848,6 +1858,40 @@ static int process_checks(struct r1bio *r1_bio)
int i;
int vcnt;
+ /* Fix variable parts of all bios */
+ vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
+ for (i = 0; i < conf->raid_disks * 2; i++) {
+ int j;
+ int size;
+ int uptodate;
+ struct bio *b = r1_bio->bios[i];
+ if (b->bi_end_io != end_sync_read)
+ continue;
+ /* fixup the bio for reuse, but preserve BIO_UPTODATE */
+ uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+ bio_reset(b);
+ if (!uptodate)
+ clear_bit(BIO_UPTODATE, &b->bi_flags);
+ b->bi_vcnt = vcnt;
+ b->bi_size = r1_bio->sectors << 9;
+ b->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ b->bi_bdev = conf->mirrors[i].rdev->bdev;
+ b->bi_end_io = end_sync_read;
+ b->bi_private = r1_bio;
+
+ size = b->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &b->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ }
+ }
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
@@ -1856,17 +1900,18 @@ static int process_checks(struct r1bio *r1_bio)
break;
}
r1_bio->read_disk = primary;
- vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- int size;
+ int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
if (sbio->bi_end_io != end_sync_read)
continue;
+ /* Now we can 'fixup' the BIO_UPTODATE flag */
+ set_bit(BIO_UPTODATE, &sbio->bi_flags);
- if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ if (uptodate) {
for (j = vcnt; j-- ; ) {
struct page *p, *s;
p = pbio->bi_io_vec[j].bv_page;
@@ -1881,33 +1926,12 @@ static int process_checks(struct r1bio *r1_bio)
if (j >= 0)
atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+ && uptodate)) {
/* No need to write to this device. */
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
continue;
}
- /* fixup the bio for reuse */
- bio_reset(sbio);
- sbio->bi_vcnt = vcnt;
- sbio->bi_size = r1_bio->sectors << 9;
- sbio->bi_sector = r1_bio->sector +
- conf->mirrors[i].rdev->data_offset;
- sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- sbio->bi_end_io = end_sync_read;
- sbio->bi_private = r1_bio;
-
- size = sbio->bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &sbio->bi_io_vec[j];
- bi->bv_offset = 0;
- if (size > PAGE_SIZE)
- bi->bv_len = PAGE_SIZE;
- else
- bi->bv_len = size;
- size -= PAGE_SIZE;
- }
bio_copy_data(sbio, pbio);
}
@@ -2027,7 +2051,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags))
+ !test_bit(Faulty, &rdev->flags))
r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE);
}
@@ -2039,7 +2063,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
+ !test_bit(Faulty, &rdev->flags)) {
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6ddae2501b9a..a1ea2a753912 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1321,7 +1321,7 @@ read_again:
/* Could not read all from this device, so we will
* need another r10_bio.
*/
- sectors_handled = (r10_bio->sectors + max_sectors
+ sectors_handled = (r10_bio->sector + max_sectors
- bio->bi_sector);
r10_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
@@ -1329,7 +1329,7 @@ read_again:
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
- spin_unlock(&conf->device_lock);
+ spin_unlock_irq(&conf->device_lock);
/* Cannot call generic_make_request directly
* as that will be queued in __generic_make_request
* and subsequent mempool_alloc might block
@@ -1681,11 +1681,11 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
+ /*
+ * If recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1762,6 +1762,7 @@ static int raid10_spare_active(struct mddev *mddev)
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} else if (tmp->rdev
+ && tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
count++;
@@ -2075,11 +2076,17 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* both 'first' and 'i', so we just compare them.
* All vec entries are PAGE_SIZE;
*/
- for (j = 0; j < vcnt; j++)
+ int sectors = r10_bio->sectors;
+ for (j = 0; j < vcnt; j++) {
+ int len = PAGE_SIZE;
+ if (sectors < (len / 512))
+ len = sectors * 512;
if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
page_address(tbio->bi_io_vec[j].bv_page),
- fbio->bi_io_vec[j].bv_len))
+ len))
break;
+ sectors -= len/512;
+ }
if (j == vcnt)
continue;
atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
@@ -2262,12 +2269,18 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
wbio = r10_bio->devs[1].bio;
wbio2 = r10_bio->devs[1].repl_bio;
+ /* Need to test wbio2->bi_end_io before we call
+ * generic_make_request as if the former is NULL,
+ * the latter is free to free wbio2.
+ */
+ if (wbio2 && !wbio2->bi_end_io)
+ wbio2 = NULL;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
generic_make_request(wbio);
}
- if (wbio2 && wbio2->bi_end_io) {
+ if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
md_sync_acct(conf->mirrors[d].replacement->bdev,
bio_sectors(wbio2));
@@ -2909,14 +2922,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*/
if (mddev->bitmap == NULL &&
mddev->recovery_cp == MaxSector &&
+ mddev->reshape_position == MaxSector &&
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
conf->fullsync == 0) {
*skipped = 1;
- max_sector = mddev->dev_sectors;
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
- test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- max_sector = mddev->resync_max_sectors;
- return max_sector - sector_nr;
+ return mddev->dev_sectors - sector_nr;
}
skipped:
@@ -2936,6 +2948,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
+ close_sync(conf);
return 0;
}
@@ -3186,10 +3199,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (j == conf->copies) {
/* Cannot recover, so abort the recovery or
* record a bad block */
- put_buf(r10_bio);
- if (rb2)
- atomic_dec(&rb2->remaining);
- r10_bio = rb2;
if (any_working) {
/* problem is that there are bad blocks
* on other device(s)
@@ -3221,6 +3230,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
mirror->recovery_disabled
= mddev->recovery_disabled;
}
+ put_buf(r10_bio);
+ if (rb2)
+ atomic_dec(&rb2->remaining);
+ r10_bio = rb2;
break;
}
}
@@ -3386,6 +3399,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
generic_make_request(bio);
}
}
@@ -3532,7 +3546,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
/* FIXME calc properly */
conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
- max(0,mddev->delta_disks)),
+ max(0,-mddev->delta_disks)),
GFP_KERNEL);
if (!conf->mirrors)
goto out;
@@ -3691,7 +3705,7 @@ static int run(struct mddev *mddev)
conf->geo.far_offset == 0)
goto out_free_conf;
if (conf->prev.far_copies != 1 &&
- conf->geo.far_offset == 0)
+ conf->prev.far_offset == 0)
goto out_free_conf;
}
@@ -4385,7 +4399,7 @@ read_more:
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ;
- read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_flags |= 1 << BIO_UPTODATE;
read_bio->bi_vcnt = 0;
read_bio->bi_size = 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 05e4a105b9c7..2332b5ced0dd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -60,6 +60,10 @@
#include "raid0.h"
#include "bitmap.h"
+static bool devices_handle_discard_safely = false;
+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+ "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
/*
* Stripe cache
*/
@@ -668,6 +672,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
+ /*
+ * If this is discard request, set bi_vcnt 0. We don't
+ * want to confuse SCSI because SCSI will replace payload
+ */
+ if (rw & REQ_DISCARD)
+ bi->bi_vcnt = 0;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
@@ -706,6 +716,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
+ /*
+ * If this is discard request, set bi_vcnt 0. We don't
+ * want to confuse SCSI because SCSI will replace payload
+ */
+ if (rw & REQ_DISCARD)
+ rbi->bi_vcnt = 0;
if (conf->mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
rbi, disk_devt(conf->mddev->gendisk),
@@ -1881,6 +1897,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (!uptodate) {
+ set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -2800,6 +2817,14 @@ static void handle_stripe_clean_event(struct r5conf *conf,
}
/* now that discard is done we can proceed with any sync */
clear_bit(STRIPE_DISCARD, &sh->state);
+ /*
+ * SCSI discard will change some bio fields and the stripe has
+ * no updated data, so remove it from hash list and the stripe
+ * will be reinitialized
+ */
+ spin_lock_irq(&conf->device_lock);
+ remove_hash(sh);
+ spin_unlock_irq(&conf->device_lock);
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
set_bit(STRIPE_HANDLE, &sh->state);
@@ -3371,7 +3396,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
*/
set_bit(R5_Insync, &dev->flags);
- if (rdev && test_bit(R5_WriteError, &dev->flags)) {
+ if (test_bit(R5_WriteError, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference(
@@ -3384,7 +3409,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
} else
clear_bit(R5_WriteError, &dev->flags);
}
- if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
+ if (test_bit(R5_MadeGood, &dev->flags)) {
/* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference(
@@ -3462,6 +3487,7 @@ static void handle_stripe(struct stripe_head *sh)
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
+ clear_bit(STRIPE_REPLACED, &sh->state);
}
spin_unlock(&sh->stripe_lock);
}
@@ -3539,6 +3565,8 @@ static void handle_stripe(struct stripe_head *sh)
set_bit(R5_Wantwrite, &dev->flags);
if (prexor)
continue;
+ if (s.failed > 1)
+ continue;
if (!test_bit(R5_Insync, &dev->flags) ||
((i == sh->pd_idx || i == sh->qd_idx) &&
s.failed == 0))
@@ -3607,19 +3635,23 @@ static void handle_stripe(struct stripe_head *sh)
handle_parity_checks5(conf, sh, &s, disks);
}
- if (s.replacing && s.locked == 0
- && !test_bit(STRIPE_INSYNC, &sh->state)) {
+ if ((s.replacing || s.syncing) && s.locked == 0
+ && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
+ && !test_bit(STRIPE_REPLACED, &sh->state)) {
/* Write out to replacement devices where possible */
for (i = 0; i < conf->raid_disks; i++)
- if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
- test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+ if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+ WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
set_bit(R5_WantReplace, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++;
}
- set_bit(STRIPE_INSYNC, &sh->state);
+ if (s.replacing)
+ set_bit(STRIPE_INSYNC, &sh->state);
+ set_bit(STRIPE_REPLACED, &sh->state);
}
if ((s.syncing || s.replacing) && s.locked == 0 &&
+ !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
@@ -5011,23 +5043,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
return sectors * (raid_disks - conf->max_degraded);
}
+static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+ safe_put_page(percpu->spare_page);
+ kfree(percpu->scribble);
+ percpu->spare_page = NULL;
+ percpu->scribble = NULL;
+}
+
+static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
+{
+ if (conf->level == 6 && !percpu->spare_page)
+ percpu->spare_page = alloc_page(GFP_KERNEL);
+ if (!percpu->scribble)
+ percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+
+ if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
+ free_scratch_buffer(conf, percpu);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static void raid5_free_percpu(struct r5conf *conf)
{
- struct raid5_percpu *percpu;
unsigned long cpu;
if (!conf->percpu)
return;
- get_online_cpus();
- for_each_possible_cpu(cpu) {
- percpu = per_cpu_ptr(conf->percpu, cpu);
- safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
- }
#ifdef CONFIG_HOTPLUG_CPU
unregister_cpu_notifier(&conf->cpu_notify);
#endif
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu)
+ free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
put_online_cpus();
free_percpu(conf->percpu);
@@ -5053,15 +5105,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- if (conf->level == 6 && !percpu->spare_page)
- percpu->spare_page = alloc_page(GFP_KERNEL);
- if (!percpu->scribble)
- percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
-
- if (!percpu->scribble ||
- (conf->level == 6 && !percpu->spare_page)) {
- safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
+ if (alloc_scratch_buffer(conf, percpu)) {
pr_err("%s: failed memory allocation for cpu%ld\n",
__func__, cpu);
return notifier_from_errno(-ENOMEM);
@@ -5069,10 +5113,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- safe_put_page(percpu->spare_page);
- kfree(percpu->scribble);
- percpu->spare_page = NULL;
- percpu->scribble = NULL;
+ free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
break;
default:
break;
@@ -5084,40 +5125,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
static int raid5_alloc_percpu(struct r5conf *conf)
{
unsigned long cpu;
- struct page *spare_page;
- struct raid5_percpu __percpu *allcpus;
- void *scribble;
- int err;
+ int err = 0;
- allcpus = alloc_percpu(struct raid5_percpu);
- if (!allcpus)
+ conf->percpu = alloc_percpu(struct raid5_percpu);
+ if (!conf->percpu)
return -ENOMEM;
- conf->percpu = allcpus;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ conf->cpu_notify.notifier_call = raid456_cpu_notify;
+ conf->cpu_notify.priority = 0;
+ err = register_cpu_notifier(&conf->cpu_notify);
+ if (err)
+ return err;
+#endif
get_online_cpus();
- err = 0;
for_each_present_cpu(cpu) {
- if (conf->level == 6) {
- spare_page = alloc_page(GFP_KERNEL);
- if (!spare_page) {
- err = -ENOMEM;
- break;
- }
- per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
- }
- scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
- if (!scribble) {
- err = -ENOMEM;
+ err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+ if (err) {
+ pr_err("%s: failed memory allocation for cpu%ld\n",
+ __func__, cpu);
break;
}
- per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
}
-#ifdef CONFIG_HOTPLUG_CPU
- conf->cpu_notify.notifier_call = raid456_cpu_notify;
- conf->cpu_notify.priority = 0;
- if (err == 0)
- err = register_cpu_notifier(&conf->cpu_notify);
-#endif
put_online_cpus();
return err;
@@ -5585,7 +5615,7 @@ static int run(struct mddev *mddev)
mddev->queue->limits.discard_granularity = stripe;
/*
* unaligned part of discard request will be ignored, so can't
- * guarantee discard_zerors_data
+ * guarantee discard_zeroes_data
*/
mddev->queue->limits.discard_zeroes_data = 0;
@@ -5610,6 +5640,18 @@ static int run(struct mddev *mddev)
!bdev_get_queue(rdev->bdev)->
limits.discard_zeroes_data)
discard_supported = false;
+ /* Unfortunately, discard_zeroes_data is not currently
+ * a guarantee - just a hint. So we only allow DISCARD
+ * if the sysadmin has confirmed that only safe devices
+ * are in use by setting a module parameter.
+ */
+ if (!devices_handle_discard_safely) {
+ if (discard_supported) {
+ pr_info("md/raid456: discard support disabled due to uncertainty.\n");
+ pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
+ }
+ discard_supported = false;
+ }
}
if (discard_supported &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b0b663b119a8..70c49329ca9a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -306,6 +306,7 @@ enum {
STRIPE_SYNC_REQUESTED,
STRIPE_SYNCING,
STRIPE_INSYNC,
+ STRIPE_REPLACED,
STRIPE_PREREAD_ACTIVE,
STRIPE_DELAYED,
STRIPE_DEGRADED,