/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * Metadata and data are stored on disk in 4k-sized units (blocks), regardless
 * of the underlying hardware sector size. This only works with
 * PAGE_SIZE == 4096, so one block is 8 512-byte sectors.
 */
#define BLOCK_SECTORS (8)

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded down to
					 * BLOCK_SECTORS */

	sector_t last_checkpoint;	/* log tail: where recovery scanning
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head: where new data is appended */
	u64 seq;			/* log head sequence */

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */

	struct kmem_cache *io_kc;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;
};

/*
 * An I/O range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. io_units are written to the log disk with normal writes; since we
 * always flush the log disk before moving data to the raid disks, there is
 * no need to write io_units with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio_list bios;
	atomic_t pending_io;	/* pending bios not written to log yet */
	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	wait_queue_head_t wait_state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bios have started writing to the
				 * log; no new bio is accepted */
	IO_UNIT_IO_END = 2,	/* io_unit bios have finished writing to the log */
	IO_UNIT_STRIPE_START = 3, /* stripes of the io_unit are being flushed
				   * to the raid */
	IO_UNIT_STRIPE_END = 4,	/* stripe data has finished writing to the raid */
};

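/*
 * The log device is used as a ring: sector positions advance towards
 * device_size and then wrap back to 0. The helpers below implement the
 * wrap-around addition and the head/tail distance used for space accounting.
 */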
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

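/*
 * Space between the log tail (last_checkpoint) and the log head (log_start)
 * is in use; a write of 'size' sectors fits only if the ring stays strictly
 * less than full, so the head never catches up with the tail.
 */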
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	/* We can't handle memory allocation failures so far */
	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

	io = kmem_cache_zalloc(log->io_kc, gfp);
	io->log = log;
	io->meta_page = alloc_page(gfp | __GFP_ZERO);

	bio_list_init(&io->bios);
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;
	init_waitqueue_head(&io->wait_state);
	return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	__free_page(io->meta_page);
	kmem_cache_free(log->io_kc, io);
}

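/*
 * Move the leading io_units that have reached at least 'state' from one list
 * to another, stopping at the first one that has not, so list (and log) order
 * is preserved. Caller must hold log->io_list_lock.
 */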
static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
				  enum r5l_io_unit_state state)
{
	struct r5l_io_unit *io;

	while (!list_empty(from)) {
		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
		/* don't change list order */
		if (io->state >= state)
			list_move_tail(&io->log_sibling, to);
		else
			break;
	}
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
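/*
 * Advance an io_unit's state; states only move forward. Once IO_UNIT_IO_END
 * is reached, the io_unit is moved from running_ios to io_end_ios. Caller
 * must hold log->io_list_lock; r5l_set_io_unit_state() is the locked wrapper.
 */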
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;

	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
	if (state == IO_UNIT_IO_END)
		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
				      IO_UNIT_IO_END);
	wake_up(&io->wait_state);
}

static void r5l_set_io_unit_state(struct r5l_io_unit *io,
				  enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, state);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;

	bio_put(bio);

	if (!atomic_dec_and_test(&io->pending_io))
		return;

	r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	md_wakeup_thread(log->rdev->mddev->thread);
}

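/*
 * Close the current io_unit: record the final meta_size, checksum the meta
 * block, mark the io_unit IO_UNIT_IO_START and submit all of its bios to the
 * log device. Called with log->io_mutex held.
 */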
static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	struct bio *bio;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32_le(log->uuid_checksum, (void *)block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	r5l_set_io_unit_state(io, IO_UNIT_IO_START);

	while ((bio = bio_list_pop(&io->bios))) {
		/* all IO must start from rdev->data_offset */
		bio->bi_iter.bi_sector += log->rdev->data_offset;
		submit_bio(WRITE, bio);
	}
}

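/*
 * Start a new io_unit at the current log head: initialise its meta block
 * (magic, version, sequence number, position) and queue a first bio carrying
 * the meta page. The log head then advances by one block.
 */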
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;
	struct bio *bio;

	io = r5l_alloc_io_unit(log);

	block = page_address(io->meta_page);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq;

	bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
	io->current_bio = bio;
	bio->bi_rw = WRITE;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->log_start;
	bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
	bio->bi_end_io = r5l_log_endio;
	bio->bi_private = io;

	bio_list_add(&io->bios, bio);
	atomic_inc(&io->pending_io);

	log->seq++;
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
	io->log_end = log->log_start;
	/* current bio hit the end of the log device */
	if (log->log_start == 0)
		io->current_bio = NULL;

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

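/*
 * Make sure the current io_unit's meta block can hold another payload_size
 * bytes of payload descriptors; if not, submit it and start a new io_unit.
 * Always returns 0 because the io_unit allocation uses __GFP_NOFAIL.
 */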
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	struct r5l_io_unit *io;

	io = log->current_io;
	if (io && io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);
	io = log->current_io;
	if (io)
		return 0;

	log->current_io = r5l_new_meta(log);
	return 0;
}

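/*
 * Append one payload descriptor to the current meta block: the payload type
 * (data or parity), the raid sector it belongs to, its size in 512-byte
 * sectors and one checksum per page (two pages for RAID6 parity).
 */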
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

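/*
 * Append a data/parity page to the current io_unit's bio, allocating a fresh
 * bio when the current one is full or absent, and advance the log head by one
 * block. When the head wraps back to sector 0 the current bio is dropped so
 * the next page starts a new bio at the beginning of the log device.
 */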
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

alloc_bio:
	if (!io->current_bio) {
		struct bio *bio;

		bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
		bio->bi_rw = WRITE;
		bio->bi_bdev = log->rdev->bdev;
		bio->bi_iter.bi_sector = log->log_start;
		bio->bi_end_io = r5l_log_endio;
		bio->bi_private = io;
		bio_list_add(&io->bios, bio);
		atomic_inc(&io->pending_io);
		io->current_bio = bio;
	}
	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
		io->current_bio = NULL;
		goto alloc_bio;
	}
	log->log_start = r5l_ring_add(log, log->log_start,
				      BLOCK_SECTORS);
	/* current bio hit the end of the log device */
	if (log->log_start == 0)
		io->current_bio = NULL;

	io->log_end = log->log_start;
}

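/*
 * Log one stripe: reserve meta space for all of its payload descriptors, then
 * append each dirty data page followed by the parity page(s) (P, or P and Q
 * for RAID6). The stripe is linked to the io_unit so it can be released once
 * the io_unit's stripes are safely on the raid disks. Called with
 * log->io_mutex held.
 */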
static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			   int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	r5l_get_meta(log, meta_size);
	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;
}

/*
 * This runs in raid5d, where reclaim could wait for raid5d too (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to the log; start writing it to the raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum is already calculated in the last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32_le(log->uuid_checksum,
						   addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid arrays */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data: one block for the meta block plus one per written page */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (r5l_has_free_space(log, reserve))
		r5l_log_stripe(log, sh, data_pages, parity_pages);
	else {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	}
	mutex_unlock(&log->io_mutex);

	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	/* will implement later */
}

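/*
 * Recovery is not implemented yet: pretend the log is empty and place the
 * log head one block past the recorded tail.
 */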
static int r5l_recovery_log(struct r5l_log *log)
{
	/* fake recovery */
	log->seq = log->last_cp_seq + 1;
	log->log_start = r5l_ring_add(log, log->last_checkpoint, BLOCK_SECTORS);
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

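/*
 * Read the meta block at the recorded journal_tail and validate its magic,
 * version, checksum and position. If anything is wrong (e.g. a brand new
 * journal device), start a fresh log at sector 0 with a random sequence
 * number and update the superblock; otherwise resume from the stored state.
 */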
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super block points to the correct address.
		 * The log might get data very soon; if the super block
		 * doesn't record the correct log tail address, recovery
		 * can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

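/*
 * Set up the in-memory log state for a journal rdev. The crc32 of the array
 * UUID seeds every checksum computed over meta blocks and data pages, so log
 * blocks written for a different array fail verification.
 */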
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->uuid_checksum = crc32_le(~0, (void *)rdev->mddev->uuid,
				      sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	conf->log = log;
	return 0;
error:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}