about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jens Axboe <axboe@kernel.dk> 2013-05-01 09:23:05 +0200
committer Jens Axboe <axboe@kernel.dk> 2013-05-01 09:23:05 +0200
commit f50efd2fdbd9b35b11f5778ed85beb764184bda9 (patch)
tree ed92b40995d60136fb387d210886e4aae2c37231
parent 0821e904057505c7e25d72e1a282105d023b26c9 (diff)
parent ee66850642efda91d04179cae2414310675a1f73 (diff)
Merge branch 'bcache-for-upstream' of http://evilpiepirate.org/git/linux-bcache into for-3.10/drivers
Kent writes:

Hey Jens, this is everything I've got ready for 3.10 - there's _still_ one more bug I'm trying to track down.

Andrew - I've got patches that rip out the pkey() and pbtree() macros, but they're somewhat tied up with some other nontrivial refactorings so I think I'm going to wait a bit on those.
-rw-r--r-- drivers/md/bcache/alloc.c 72
-rw-r--r-- drivers/md/bcache/bcache.h 47
-rw-r--r-- drivers/md/bcache/btree.c 3
-rw-r--r-- drivers/md/bcache/io.c 35
-rw-r--r-- drivers/md/bcache/request.c 2
-rw-r--r-- drivers/md/bcache/super.c 166
6 files changed, 213 insertions, 112 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 2879487d036..048f2947e08 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -243,31 +243,37 @@ static void invalidate_buckets_lru(struct cache *ca)
ca->heap.used = 0;
for_each_bucket(b, ca) {
+ /*
+ * If we fill up the unused list, if we then return before
+ * adding anything to the free_inc list we'll skip writing
+ * prios/gens and just go back to allocating from the unused
+ * list:
+ */
+ if (fifo_full(&ca->unused))
+ return;
+
if (!can_invalidate_bucket(ca, b))
continue;
- if (!GC_SECTORS_USED(b)) {
- if (!bch_bucket_add_unused(ca, b))
- return;
- } else {
- if (!heap_full(&ca->heap))
- heap_add(&ca->heap, b, bucket_max_cmp);
- else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
- ca->heap.data[0] = b;
- heap_sift(&ca->heap, 0, bucket_max_cmp);
- }
+ if (!GC_SECTORS_USED(b) &&
+ bch_bucket_add_unused(ca, b))
+ continue;
+
+ if (!heap_full(&ca->heap))
+ heap_add(&ca->heap, b, bucket_max_cmp);
+ else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
+ ca->heap.data[0] = b;
+ heap_sift(&ca->heap, 0, bucket_max_cmp);
}
}
- if (ca->heap.used * 2 < ca->heap.size)
- bch_queue_gc(ca->set);
-
for (i = ca->heap.used / 2 - 1; i >= 0; --i)
heap_sift(&ca->heap, i, bucket_min_cmp);
while (!fifo_full(&ca->free_inc)) {
if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
- /* We don't want to be calling invalidate_buckets()
+ /*
+ * We don't want to be calling invalidate_buckets()
* multiple times when it can't do anything
*/
ca->invalidate_needs_gc = 1;
@@ -343,15 +349,22 @@ static void invalidate_buckets(struct cache *ca)
invalidate_buckets_random(ca);
break;
}
+
+ pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
+ fifo_used(&ca->free), ca->free.size,
+ fifo_used(&ca->free_inc), ca->free_inc.size,
+ fifo_used(&ca->unused), ca->unused.size);
}
#define allocator_wait(ca, cond) \
do { \
DEFINE_WAIT(__wait); \
\
- while (!(cond)) { \
+ while (1) { \
prepare_to_wait(&ca->set->alloc_wait, \
&__wait, TASK_INTERRUPTIBLE); \
+ if (cond) \
+ break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
@@ -360,7 +373,6 @@ do { \
} \
\
schedule(); \
- __set_current_state(TASK_RUNNING); \
mutex_lock(&(ca)->set->bucket_lock); \
} \
\
@@ -374,6 +386,11 @@ void bch_allocator_thread(struct closure *cl)
mutex_lock(&ca->set->bucket_lock);
while (1) {
+ /*
+ * First, we pull buckets off of the unused and free_inc lists,
+ * possibly issue discards to them, then we add the bucket to
+ * the free list:
+ */
while (1) {
long bucket;
@@ -398,17 +415,26 @@ void bch_allocator_thread(struct closure *cl)
}
}
- allocator_wait(ca, ca->set->gc_mark_valid);
- invalidate_buckets(ca);
+ /*
+ * We've run out of free buckets, we need to find some buckets
+ * we can invalidate. First, invalidate them in memory and add
+ * them to the free_inc list:
+ */
- allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) ||
- !CACHE_SYNC(&ca->set->sb));
+ allocator_wait(ca, ca->set->gc_mark_valid &&
+ (ca->need_save_prio > 64 ||
+ !ca->invalidate_needs_gc));
+ invalidate_buckets(ca);
+ /*
+ * Now, we write their new gens to disk so we can start writing
+ * new stuff to them:
+ */
+ allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
if (CACHE_SYNC(&ca->set->sb) &&
(!fifo_empty(&ca->free_inc) ||
- ca->need_save_prio > 64)) {
+ ca->need_save_prio > 64))
bch_prio_write(ca);
- }
}
}
@@ -475,7 +501,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
for (i = 0; i < KEY_PTRS(k); i++) {
struct bucket *b = PTR_BUCKET(c, k, i);
- SET_GC_MARK(b, 0);
+ SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
SET_GC_SECTORS_USED(b, 0);
bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index f05723565f1..340146d7c17 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -223,11 +223,17 @@ struct bkey {
#define BKEY_PADDED(key) \
union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
-/* Version 1: Backing device
+/* Version 0: Cache device
+ * Version 1: Backing device
* Version 2: Seed pointer into btree node checksum
- * Version 3: New UUID format
+ * Version 3: Cache device with new UUID format
+ * Version 4: Backing device with data offset
*/
-#define BCACHE_SB_VERSION 3
+#define BCACHE_SB_VERSION_CDEV 0
+#define BCACHE_SB_VERSION_BDEV 1
+#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
+#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
+#define BCACHE_SB_MAX_VERSION 4
#define SB_SECTOR 8
#define SB_SIZE 4096
@@ -236,13 +242,12 @@ struct bkey {
/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
#define MAX_CACHES_PER_SET 8
-#define BDEV_DATA_START 16 /* sectors */
+#define BDEV_DATA_START_DEFAULT 16 /* sectors */
struct cache_sb {
uint64_t csum;
uint64_t offset; /* sector where this sb was written */
uint64_t version;
-#define CACHE_BACKING_DEV 1
uint8_t magic[16];
@@ -257,12 +262,28 @@ struct cache_sb {
uint64_t seq;
uint64_t pad[8];
- uint64_t nbuckets; /* device size */
- uint16_t block_size; /* sectors */
- uint16_t bucket_size; /* sectors */
+ union {
+ struct {
+ /* Cache devices */
+ uint64_t nbuckets; /* device size */
+
+ uint16_t block_size; /* sectors */
+ uint16_t bucket_size; /* sectors */
- uint16_t nr_in_set;
- uint16_t nr_this_dev;
+ uint16_t nr_in_set;
+ uint16_t nr_this_dev;
+ };
+ struct {
+ /* Backing devices */
+ uint64_t data_offset;
+
+ /*
+ * block_size from the cache device section is still used by
+ * backing devices, so don't add anything here until we fix
+ * things to not need it for backing devices anymore
+ */
+ };
+ };
uint32_t last_mount; /* time_t */
@@ -861,6 +882,12 @@ static inline bool key_merging_disabled(struct cache_set *c)
#endif
}
+static inline bool SB_IS_BDEV(const struct cache_sb *sb)
+{
+ return sb->version == BCACHE_SB_VERSION_BDEV
+ || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
+}
+
struct bbio {
unsigned submit_time_us;
union {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 85234079377..7a5658f04e6 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -984,7 +984,7 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
if (b->prio_blocked &&
!atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
- closure_wake_up(&b->c->bucket_wait);
+ wake_up(&b->c->alloc_wait);
b->prio_blocked = 0;
@@ -1548,7 +1548,6 @@ static void bch_btree_gc(struct closure *cl)
trace_bcache_gc_end(c->sb.set_uuid);
wake_up(&c->alloc_wait);
- closure_wake_up(&c->bucket_wait);
continue_at(cl, bch_moving_gc, bch_gc_wq);
}
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index f565512f6fa..48efd4dea64 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -38,6 +38,15 @@ static void bch_generic_make_request_hack(struct bio *bio)
bio = clone;
}
+ /*
+ * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
+ * bios might have had more than that (before we split them per device
+ * limitations).
+ *
+ * To be taken out once immutable bvec stuff is in.
+ */
+ bio->bi_max_vecs = bio->bi_vcnt;
+
generic_make_request(bio);
}
@@ -149,34 +158,32 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
{
unsigned ret = bio_sectors(bio);
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
+ queue_max_segments(q));
struct bio_vec *bv, *end = bio_iovec(bio) +
- min_t(int, bio_segments(bio), queue_max_segments(q));
-
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = 0,
- .bi_rw = bio->bi_rw,
- };
+ min_t(int, bio_segments(bio), max_segments);
if (bio->bi_rw & REQ_DISCARD)
return min(ret, q->limits.max_discard_sectors);
- if (bio_segments(bio) > queue_max_segments(q) ||
+ if (bio_segments(bio) > max_segments ||
q->merge_bvec_fn) {
ret = 0;
for (bv = bio_iovec(bio); bv < end; bv++) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = ret << 9,
+ .bi_rw = bio->bi_rw,
+ };
+
if (q->merge_bvec_fn &&
q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
break;
- ret += bv->bv_len >> 9;
- bvm.bi_size += bv->bv_len;
+ ret += bv->bv_len >> 9;
}
-
- if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9)
- return (BIO_MAX_PAGES * PAGE_SIZE) >> 9;
}
ret = min(ret, queue_max_sectors(q));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 83731dc36f3..e5ff12e52d5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1220,7 +1220,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
part_stat_unlock();
bio->bi_bdev = dc->bdev;
- bio->bi_sector += BDEV_DATA_START;
+ bio->bi_sector += dc->sb.data_offset;
if (cached_dev_get(dc)) {
s = search_alloc(bio, d);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 5fa3cd2d9ff..c8046bc4aa5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -110,15 +110,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->flags = le64_to_cpu(s->flags);
sb->seq = le64_to_cpu(s->seq);
-
- sb->nbuckets = le64_to_cpu(s->nbuckets);
- sb->block_size = le16_to_cpu(s->block_size);
- sb->bucket_size = le16_to_cpu(s->bucket_size);
-
- sb->nr_in_set = le16_to_cpu(s->nr_in_set);
- sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
sb->last_mount = le32_to_cpu(s->last_mount);
-
sb->first_bucket = le16_to_cpu(s->first_bucket);
sb->keys = le16_to_cpu(s->keys);
@@ -147,53 +139,81 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
if (bch_is_zero(sb->uuid, 16))
goto err;
- err = "Unsupported superblock version";
- if (sb->version > BCACHE_SB_VERSION)
- goto err;
+ sb->block_size = le16_to_cpu(s->block_size);
- err = "Bad block/bucket size";
- if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS ||
- !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS)
+ err = "Superblock block size smaller than device block size";
+ if (sb->block_size << 9 < bdev_logical_block_size(bdev))
goto err;
- err = "Too many buckets";
- if (sb->nbuckets > LONG_MAX)
- goto err;
+ switch (sb->version) {
+ case BCACHE_SB_VERSION_BDEV:
+ sb->data_offset = BDEV_DATA_START_DEFAULT;
+ break;
+ case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+ sb->data_offset = le64_to_cpu(s->data_offset);
- err = "Not enough buckets";
- if (sb->nbuckets < 1 << 7)
- goto err;
+ err = "Bad data offset";
+ if (sb->data_offset < BDEV_DATA_START_DEFAULT)
+ goto err;
- err = "Invalid superblock: device too small";
- if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
- goto err;
+ break;
+ case BCACHE_SB_VERSION_CDEV:
+ case BCACHE_SB_VERSION_CDEV_WITH_UUID:
+ sb->nbuckets = le64_to_cpu(s->nbuckets);
+ sb->block_size = le16_to_cpu(s->block_size);
+ sb->bucket_size = le16_to_cpu(s->bucket_size);
- if (sb->version == CACHE_BACKING_DEV)
- goto out;
+ sb->nr_in_set = le16_to_cpu(s->nr_in_set);
+ sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
- err = "Bad UUID";
- if (bch_is_zero(sb->set_uuid, 16))
- goto err;
+ err = "Too many buckets";
+ if (sb->nbuckets > LONG_MAX)
+ goto err;
- err = "Bad cache device number in set";
- if (!sb->nr_in_set ||
- sb->nr_in_set <= sb->nr_this_dev ||
- sb->nr_in_set > MAX_CACHES_PER_SET)
- goto err;
+ err = "Not enough buckets";
+ if (sb->nbuckets < 1 << 7)
+ goto err;
- err = "Journal buckets not sequential";
- for (i = 0; i < sb->keys; i++)
- if (sb->d[i] != sb->first_bucket + i)
+ err = "Bad block/bucket size";
+ if (!is_power_of_2(sb->block_size) ||
+ sb->block_size > PAGE_SECTORS ||
+ !is_power_of_2(sb->bucket_size) ||
+ sb->bucket_size < PAGE_SECTORS)
goto err;
- err = "Too many journal buckets";
- if (sb->first_bucket + sb->keys > sb->nbuckets)
- goto err;
+ err = "Invalid superblock: device too small";
+ if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
+ goto err;
+
+ err = "Bad UUID";
+ if (bch_is_zero(sb->set_uuid, 16))
+ goto err;
+
+ err = "Bad cache device number in set";
+ if (!sb->nr_in_set ||
+ sb->nr_in_set <= sb->nr_this_dev ||
+ sb->nr_in_set > MAX_CACHES_PER_SET)
+ goto err;
- err = "Invalid superblock: first bucket comes before end of super";
- if (sb->first_bucket * sb->bucket_size < 16)
+ err = "Journal buckets not sequential";
+ for (i = 0; i < sb->keys; i++)
+ if (sb->d[i] != sb->first_bucket + i)
+ goto err;
+
+ err = "Too many journal buckets";
+ if (sb->first_bucket + sb->keys > sb->nbuckets)
+ goto err;
+
+ err = "Invalid superblock: first bucket comes before end of super";
+ if (sb->first_bucket * sb->bucket_size < 16)
+ goto err;
+
+ break;
+ default:
+ err = "Unsupported superblock version";
goto err;
-out:
+ }
+
sb->last_mount = get_seconds();
err = NULL;
@@ -286,7 +306,7 @@ void bcache_write_super(struct cache_set *c)
for_each_cache(ca, c, i) {
struct bio *bio = &ca->sb_bio;
- ca->sb.version = BCACHE_SB_VERSION;
+ ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
ca->sb.seq = c->sb.seq;
ca->sb.last_mount = c->sb.last_mount;
@@ -641,6 +661,35 @@ void bcache_device_stop(struct bcache_device *d)
closure_queue(&d->cl);
}
+static void bcache_device_unlink(struct bcache_device *d)
+{
+ unsigned i;
+ struct cache *ca;
+
+ sysfs_remove_link(&d->c->kobj, d->name);
+ sysfs_remove_link(&d->kobj, "cache");
+
+ for_each_cache(ca, d->c, i)
+ bd_unlink_disk_holder(ca->bdev, d->disk);
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+ const char *name)
+{
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, d->c, i)
+ bd_link_disk_holder(ca->bdev, d->disk);
+
+ snprintf(d->name, BCACHEDEVNAME_SIZE,
+ "%s%u", name, d->id);
+
+ WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+ sysfs_create_link(&c->kobj, &d->kobj, d->name),
+ "Couldn't create device <-> cache set symlinks");
+}
+
static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@@ -656,6 +705,8 @@ static void bcache_device_detach(struct bcache_device *d)
atomic_set(&d->detaching, 0);
}
+ bcache_device_unlink(d);
+
d->c->devices[d->id] = NULL;
closure_put(&d->c->caching);
d->c = NULL;
@@ -673,17 +724,6 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
closure_get(&c->caching);
}
-static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
- const char *name)
-{
- snprintf(d->name, BCACHEDEVNAME_SIZE,
- "%s%u", name, d->id);
-
- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
- sysfs_create_link(&c->kobj, &d->kobj, d->name),
- "Couldn't create device <-> cache set symlinks");
-}
-
static void bcache_device_free(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@@ -784,6 +824,7 @@ void bch_cached_dev_run(struct cached_dev *dc)
}
add_disk(d->disk);
+ bd_link_disk_holder(dc->bdev, dc->disk.disk);
#if 0
char *env[] = { "SYMLINK=label" , NULL };
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
@@ -803,9 +844,6 @@ static void cached_dev_detach_finish(struct work_struct *w)
BUG_ON(!atomic_read(&dc->disk.detaching));
BUG_ON(atomic_read(&dc->count));
- sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name);
- sysfs_remove_link(&dc->disk.kobj, "cache");
-
mutex_lock(&bch_register_lock);
memset(&dc->sb.set_uuid, 0, 16);
@@ -920,7 +958,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
bcache_device_attach(&dc->disk, c, u - c->uuids);
- bcache_device_link(&dc->disk, c, "bdev");
list_move(&dc->list, &c->cached_devs);
calc_cached_dev_sectors(c);
@@ -938,6 +975,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
bch_cached_dev_run(dc);
+ bcache_device_link(&dc->disk, c, "bdev");
pr_info("Caching %s as %s on set %pU",
bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
@@ -961,6 +999,7 @@ static void cached_dev_free(struct closure *cl)
mutex_lock(&bch_register_lock);
+ bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
list_del(&dc->list);
@@ -1049,7 +1088,11 @@ static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
g = dc->disk.disk;
- set_capacity(g, dc->bdev->bd_part->nr_sects - 16);
+ set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+
+ g->queue->backing_dev_info.ra_pages =
+ max(g->queue->backing_dev_info.ra_pages,
+ bdev->bd_queue->backing_dev_info.ra_pages);
bch_cached_dev_request_init(dc);
@@ -1099,8 +1142,7 @@ static void flash_dev_flush(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
- sysfs_remove_link(&d->c->kobj, d->name);
- sysfs_remove_link(&d->kobj, "cache");
+ bcache_device_unlink(d);
kobject_del(&d->kobj);
continue_at(cl, flash_dev_free, system_wq);
}
@@ -1802,7 +1844,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (err)
goto err_close;
- if (sb->version == CACHE_BACKING_DEV) {
+ if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
err = register_bdev(sb, sb_page, bdev, dc);