diff options
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r-- | drivers/md/bcache/bcache.h | 8 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 63 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 29 | ||||
-rw-r--r-- | drivers/md/bcache/closure.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 42 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 23 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 31 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 8 | ||||
-rw-r--r-- | drivers/md/bcache/util.c | 11 | ||||
-rw-r--r-- | drivers/md/bcache/util.h | 12 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 42 |
12 files changed, 201 insertions, 76 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b42a4ab..6bc016eabdc9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -437,6 +437,7 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; + int flush_done; atomic_long_t sectors_dirty; unsigned long sectors_dirty_gc; @@ -498,7 +499,7 @@ struct cached_dev { */ atomic_t has_dirty; - struct ratelimit writeback_rate; + struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; /* @@ -507,10 +508,9 @@ struct cached_dev { */ sector_t last_read; - /* Number of writeback bios in flight */ - atomic_t in_flight; + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; struct closure_with_timer writeback; - struct closure_waitlist writeback_wait; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cb4578a327b9..1b27cbd822e1 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -918,29 +918,59 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) /* Mergesort */ -static void btree_sort_fixup(struct btree_iter *iter) +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { while (iter->used > 1) { struct btree_iter_set *top = iter->data, *i = top + 1; - struct bkey *k; if (iter->used > 2 && btree_iter_cmp(i[0], i[1])) i++; - for (k = i->k; - k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; - k = bkey_next(k)) - if (top->k > i->k) - __bch_cut_front(top->k, k); - else if (KEY_SIZE(k)) - bch_cut_back(&START_KEY(k), top->k); - - if (top->k < i->k || k == i->k) + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) break; - heap_sift(iter, i - top, btree_iter_cmp); + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, btree_iter_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, btree_iter_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, btree_iter_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } } + + return NULL; } static void btree_mergesort(struct btree *b, struct bset *out, @@ -948,15 +978,20 @@ static void btree_mergesort(struct btree *b, struct bset *out, bool fixup, bool remove_stale) { struct bkey *k, *last = NULL; + BKEY_PADDED(k) tmp; bool (*bad)(struct btree *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; while (!btree_iter_end(iter)) { if (fixup && !b->level) - btree_sort_fixup(iter); + k = btree_sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = bch_btree_iter_next(iter); - k = bch_btree_iter_next(iter); if (bad(b, k)) continue; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..7d3deab11fce 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -326,10 +326,25 @@ static void do_btree_write(struct btree *b) i->csum = btree_csum_set(b, i); btree_bio_init(b); - b->bio->bi_rw = REQ_META|WRITE_SYNC; + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); @@ -618,7 +633,7 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) return mca_can_free(c) * c->btree_pages; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_WAIT) + if (sc->gfp_mask & __GFP_IO) mutex_lock(&c->bucket_lock); else if (!mutex_trylock(&c->bucket_lock)) return -1; @@ -1419,8 +1434,10 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { b->gc_gen = b->gen; - if (!atomic_read(&b->pin)) + if (!atomic_read(&b->pin)) { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_SECTORS_USED(b, 0); + } } for (d = c->devices; @@ -2140,6 +2157,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) void bch_btree_set_root(struct btree *b) { unsigned i; + struct closure cl; + + closure_init_stack(&cl); BUG_ON(!b->written); @@ -2153,8 +2173,9 @@ void bch_btree_set_root(struct btree *b) b->c->root = b; __bkey_put(b->c, &b->key); - bch_journal_meta(b->c, NULL); + bch_journal_meta(b->c, &cl); pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); + closure_sync(&cl); } /* Cache lookup */ diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index bd05a9a8c7cf..9aba2017f0d1 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) } else { struct closure *parent = cl->parent; struct closure_waitlist *wait = closure_waitlist(cl); + closure_fn *destructor = cl->fn; closure_debug_destroy(cl); + smp_mb(); atomic_set(&cl->remaining, -1); if (wait) closure_wake_up(wait); - if (cl->fn) - cl->fn(cl); + if (destructor) + destructor(cl); if (parent) closure_put(parent); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..d285cd49104c 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -97,6 +97,8 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (bio->bi_rw & REQ_DISCARD) { ret = bio_alloc_bioset(gfp, 1, bs); + if (!ret) + return NULL; idx = 0; goto out; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..151a4ab90dd4 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -151,7 +151,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); pr_debug("%u journal buckets", ca->sb.njournal_buckets); - /* Read journal buckets ordered by golden ratio hash to quickly + /* + * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries */ for (i = 0; i < ca->sb.njournal_buckets; i++) { @@ -164,36 +165,45 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, goto bsearch; } - /* If that fails, check all the buckets we haven't checked + /* + * If that fails, check all the buckets we haven't checked * already */ pr_debug("falling back to linear search"); - for (l = 0; l < ca->sb.njournal_buckets; l++) { - if (test_bit(l, bitmap)) - continue; - + for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); + l < ca->sb.njournal_buckets; + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) if (read_bucket(l)) goto bsearch; - } + + if (list_empty(list)) + continue; bsearch: /* Binary search */ m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; + m = (l + r) >> 1; + read_bucket(m); - if (read_bucket(m)) + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) l = m; else r = m; } - /* Read buckets in reverse order until we stop finding more + /* + * Read buckets in reverse order until we stop finding more * journal entries */ - pr_debug("finishing up"); + pr_debug("finishing up: m %u njournal_buckets %u", + m, ca->sb.njournal_buckets); l = m; while (1) { @@ -221,9 +231,10 @@ bsearch: } } - c->journal.seq = list_entry(list->prev, - struct journal_replay, - list)->j.seq; + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; return 0; #undef read_bucket @@ -420,7 +431,7 @@ static void do_journal_discard(struct cache *ca) return; } - switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { + switch (atomic_read(&ja->discard_in_flight)) { case DISCARD_IN_FLIGHT: return; @@ -617,7 +628,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_sector = PTR_OFFSET(k, i); bio->bi_bdev = ca->bdev; - bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; bio->bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; @@ -681,6 +692,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl) if (cl) BUG_ON(!closure_wait(&w->wait, cl)); + closure_flush(&c->journal.io); __journal_try_write(c, true); } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..a30a0f8a41c0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -489,6 +489,12 @@ static void bch_insert_data_loop(struct closure *cl) bch_queue_gc(op->c); } + /* + * Journal writes are marked REQ_FLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + do { unsigned i; struct bkey *k; @@ -716,7 +722,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; @@ -1047,9 +1053,22 @@ static void request_write(struct cached_dev *dc, struct search *s) trace_bcache_writethrough(s->orig_bio); closure_bio_submit(bio, cl, s->d); } else { - s->op.cache_bio = bio; trace_bcache_writeback(s->orig_bio); bch_writeback_add(dc, bio_sectors(bio)); + s->op.cache_bio = bio; + + if (bio->bi_rw & REQ_FLUSH) { + /* Also need to send a flush to the backing device */ + struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, + dc->disk.bio_split); + + flush->bi_rw = WRITE_FLUSH; + flush->bi_bdev = bio->bi_bdev; + flush->bi_end_io = request_endio; + flush->bi_private = cl; + + closure_bio_submit(flush, cl, s->d); + } } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..b4713cea1913 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -704,7 +704,8 @@ static void bcache_device_detach(struct bcache_device *d) atomic_set(&d->detaching, 0); } - bcache_device_unlink(d); + if (!d->flush_done) + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -781,6 +782,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + return 0; } @@ -1014,6 +1017,14 @@ static void cached_dev_flush(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; + mutex_lock(&bch_register_lock); + d->flush_done = 1; + + if (d->c) + bcache_device_unlink(d); + + mutex_unlock(&bch_register_lock); + bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); @@ -1303,18 +1314,22 @@ static void cache_set_flush(struct closure *cl) static void __cache_set_unregister(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc, *t; + struct cached_dev *dc; size_t i; mutex_lock(&bch_register_lock); - if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) - list_for_each_entry_safe(dc, t, &c->cached_devs, list) - bch_cached_dev_detach(dc); - for (i = 0; i < c->nr_uuids; i++) - if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) - bcache_device_stop(c->devices[i]); + if (c->devices[i]) { + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); + } else { + bcache_device_stop(c->devices[i]); + } + } mutex_unlock(&bch_register_lock); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca47e4c6..e9bd6c0cca5b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -214,7 +214,13 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { - memcpy(dc->sb.label, buf, SB_LABEL_SIZE); + if (size > SB_LABEL_SIZE) + return -EINVAL; + memcpy(dc->sb.label, buf, size); + if (size < SB_LABEL_SIZE) + dc->sb.label[size] = '\0'; + if (size && dc->sb.label[size - 1] == '\n') + dc->sb.label[size - 1] = '\0'; bch_write_bdev_super(dc, NULL); if (dc->disk.c) { memcpy(dc->disk.c->uuids[dc->disk.id].label, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index da3a99e85b1e..38a43f8d36ec 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) stats->last = now ?: 1; } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done) +/** + * bch_next_delay() - increment @d by the amount of work done, and return how + * long to delay until the next time to do some work. + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + * + * Returns the amount of time to delay by, in jiffies + */ +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e38c3a..43fd78affcea 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -452,17 +452,23 @@ read_attribute(name ## _last_ ## frequency_units) (ewma) >> factor; \ }) -struct ratelimit { +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ uint64_t next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to bch_next_delay() + */ unsigned rate; }; -static inline void ratelimit_reset(struct ratelimit *d) +static inline void bch_ratelimit_reset(struct bch_ratelimit *d) { d->next = local_clock(); } -unsigned bch_next_delay(struct ratelimit *d, uint64_t done); +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); #define __DIV_SAFE(n, d, zero) \ ({ \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..841f0490d4ef 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -91,11 +91,15 @@ static void update_writeback_rate(struct work_struct *work) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { + uint64_t ret; + if (atomic_read(&dc->disk.detaching) || !dc->writeback_percent) return 0; - return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); + + return min_t(uint64_t, ret, HZ); } /* Background writeback */ @@ -165,7 +169,7 @@ static void refill_dirty(struct closure *cl) up_write(&dc->writeback_lock); - ratelimit_reset(&dc->writeback_rate); + bch_ratelimit_reset(&dc->writeback_rate); /* Punt to workqueue only so we don't recurse and blow the stack */ continue_at(cl, read_dirty, dirty_wq); @@ -246,9 +250,7 @@ static void write_dirty_finish(struct closure *cl) } bch_keybuf_del(&dc->writeback_keys, w); - atomic_dec_bug(&dc->in_flight); - - closure_wake_up(&dc->writeback_wait); + up(&dc->in_flight); closure_return_with_destructor(cl, dirty_io_destructor); } @@ -278,7 +280,7 @@ static void write_dirty(struct closure *cl) trace_bcache_write_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty_finish, dirty_wq); + continue_at(cl, write_dirty_finish, system_wq); } static void read_dirty_endio(struct bio *bio, int error) @@ -299,7 +301,7 @@ static void read_dirty_submit(struct closure *cl) trace_bcache_read_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); - continue_at(cl, write_dirty, dirty_wq); + continue_at(cl, write_dirty, system_wq); } static void read_dirty(struct closure *cl) @@ -324,12 +326,8 @@ static void read_dirty(struct closure *cl) if (delay > 0 && (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) { - w->private = NULL; - - closure_delay(&dc->writeback, delay); - continue_at(cl, read_dirty, dirty_wq); - } + jiffies_to_msecs(delay) > 50)) + delay = schedule_timeout_uninterruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -354,15 +352,10 @@ static void read_dirty(struct closure *cl) pr_debug("%s", pkey(&w->key)); - closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + down(&dc->in_flight); + closure_call(&io->cl, read_dirty_submit, NULL, cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); - - atomic_inc(&dc->in_flight); - - if (!closure_wait_event(&dc->writeback_wait, cl, - atomic_read(&dc->in_flight) < 64)) - continue_at(cl, read_dirty, dirty_wq); } if (0) { @@ -372,11 +365,16 @@ err: bch_keybuf_del(&dc->writeback_keys, w); } - refill_dirty(cl); + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + continue_at(cl, refill_dirty, dirty_wq); } void bch_cached_dev_writeback_init(struct cached_dev *dc) { + sema_init(&dc->in_flight, 64); closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); @@ -406,7 +404,7 @@ void bch_writeback_exit(void) int __init bch_writeback_init(void) { - dirty_wq = create_singlethread_workqueue("bcache_writeback"); + dirty_wq = create_workqueue("bcache_writeback"); if (!dirty_wq) return -ENOMEM; |