/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * Metadata and data are stored on disk in 4k-sized units (blocks), regardless
 * of the underlying hardware sector size. This only works with
 * PAGE_SIZE == 4096, so one block is 8 512-byte sectors.
 */
#define BLOCK_SECTORS (8)

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded down to
					 * BLOCK_SECTORS */

	sector_t last_checkpoint;	/* log tail: where recovery scanning
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head: where new data is appended */
	u64 seq;			/* log head sequence */

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */

	struct kmem_cache *io_kc;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;
};

/*
 * An I/O range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. io_units are written to the log disk with normal writes; since we
 * always flush the log disk before moving data to the raid disks, there is
 * no need to write io_units with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio_list bios;
	atomic_t pending_io;	/* pending bios not written to log yet */
	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	wait_queue_head_t wait_state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bios have started writing to the
				 * log; no new bio is accepted */
	IO_UNIT_IO_END = 2,	/* io_unit bios have finished writing to the log */
	IO_UNIT_STRIPE_START = 3, /* stripes of the io_unit are being flushed
				   * to the raid */
	IO_UNIT_STRIPE_END = 4,	/* stripe data has finished writing to the raid */
};

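/*
 * The log device is used as a ring: sector positions advance towards
 * device_size and then wrap back to 0. The helpers below implement the
 * wrap-around addition and the head/tail distance used for space accounting.
 */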
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

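/*
 * Space between the log tail (last_checkpoint) and the log head (log_start)
 * is in use; a write of 'size' sectors fits only if the ring stays strictly
 * less than full, so the head never catches up with the tail.
 */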
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	/* We can't handle memory allocation failures so far */
	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

	io = kmem_cache_zalloc(log->io_kc, gfp);
	io->log = log;
	io->meta_page = alloc_page(gfp | __GFP_ZERO);

	bio_list_init(&io->bios);
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;
	init_waitqueue_head(&io->wait_state);
	return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	__free_page(io->meta_page);
	kmem_cache_free(log->io_kc, io);
}

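/*
 * Move the leading io_units that have reached at least 'state' from one list
 * to another, stopping at the first one that has not, so list (and log) order
 * is preserved. Caller must hold log->io_list_lock.
 */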
static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
				  enum r5l_io_unit_state state)
{
	struct r5l_io_unit *io;

	while (!list_empty(from)) {
		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
		/* don't change list order */
		if (io->state >= state)
			list_move_tail(&io->log_sibling, to);
		else
			break;
	}
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
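/*
 * Advance an io_unit's state; states only move forward. Once IO_UNIT_IO_END
 * is reached, the io_unit is moved from running_ios to io_end_ios. Caller
 * must hold log->io_list_lock; r5l_set_io_unit_state() is the locked wrapper.
 */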
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;

	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
	if (state == IO_UNIT_IO_END)
		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
				      IO_UNIT_IO_END);
	wake_up(&io->wait_state);
}

static void r5l_set_io_unit_state(struct r5l_io_unit *io,
				  enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, state);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;

	bio_put(bio);

	if (!atomic_dec_and_test(&io->pending_io))
		return;

	r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	md_wakeup_thread(log->rdev->mddev->thread);
}

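/*
 * Close the current io_unit: record the final meta_size, checksum the meta
 * block, mark the io_unit IO_UNIT_IO_START and submit all of its bios to the
 * log device. Called with log->io_mutex held.
 */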
static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	struct bio *bio;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32_le(log->uuid_checksum, (void *)block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	r5l_set_io_unit_state(io, IO_UNIT_IO_START);

	while ((bio = bio_list_pop(&io->bios))) {
		/* all IO must start from rdev->data_offset */
		bio->bi_iter.bi_sector += log->rdev->data_offset;
		submit_bio(WRITE, bio);
	}
}

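/*
 * Start a new io_unit at the current log head: initialise its meta block
 * (magic, version, sequence number, position) and queue a first bio carrying
 * the meta page. The log head then advances by one block.
 */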
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;
	struct bio *bio;

	io = r5l_alloc_io_unit(log);

	block = page_address(io->meta_page);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq;

	bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
	io->current_bio = bio;
	bio->bi_rw = WRITE;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->log_start;
	bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
	bio->bi_end_io = r5l_log_endio;
	bio->bi_private = io;

	bio_list_add(&io->bios, bio);
	atomic_inc(&io->pending_io);

	log->seq++;
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
	io->log_end = log->log_start;
	/* current bio hit the end of the log device */
	if (log->log_start == 0)
		io->current_bio = NULL;

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

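/*
 * Make sure the current io_unit's meta block can hold another payload_size
 * bytes of payload descriptors; if not, submit it and start a new io_unit.
 * Always returns 0 because the io_unit allocation uses __GFP_NOFAIL.
 */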
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	struct r5l_io_unit *io;

	io = log->current_io;
	if (io && io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);
	io = log->current_io;
	if (io)
		return 0;

	log->current_io = r5l_new_meta(log);
	return 0;
}

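/*
 * Append one payload descriptor to the current meta block: the payload type
 * (data or parity), the raid sector it belongs to, its size in 512-byte
 * sectors and one checksum per page (two pages for RAID6 parity).
 */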
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

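/*
 * Append a data/parity page to the current io_unit's bio, allocating a fresh
 * bio when the current one is full or absent, and advance the log head by one
 * block. When the head wraps back to sector 0 the current bio is dropped so
 * the next page starts a new bio at the beginning of the log device.
 */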
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

alloc_bio:
	if (!io->current_bio) {
		struct bio *bio;

		bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
		bio->bi_rw = WRITE;
		bio->bi_bdev = log->rdev->bdev;
		bio->bi_iter.bi_sector = log->log_start;
		bio->bi_end_io = r5l_log_endio;
		bio->bi_private = io;
		bio_list_add(&io->bios, bio);
		atomic_inc(&io->pending_io);
		io->current_bio = bio;
	}
	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
		io->current_bio = NULL;
		goto alloc_bio;
	}
	log->log_start = r5l_ring_add(log, log->log_start,
				      BLOCK_SECTORS);
	/* current bio hit the end of the log device */
	if (log->log_start == 0)
		io->current_bio = NULL;

	io->log_end = log->log_start;
}

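/*
 * Log one stripe: reserve meta space for all of its payload descriptors, then
 * append each dirty data page followed by the parity page(s) (P, or P and Q
 * for RAID6). The stripe is linked to the io_unit so it can be released once
 * the io_unit's stripes are safely on the raid disks. Called with
 * log->io_mutex held.
 */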
static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			   int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	r5l_get_meta(log, meta_size);
	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;
}

/*
 * This runs in raid5d, where reclaim could wait for raid5d too (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to the log; start writing it to the raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum is already calculated in the last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32_le(log->uuid_checksum,
						   addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid arrays */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data: one block for the meta block plus one per written page */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (r5l_has_free_space(log, reserve))
		r5l_log_stripe(log, sh, data_pages, parity_pages);
	else {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	}
	mutex_unlock(&log->io_mutex);

	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	/* will implement later */
}

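/*
 * Recovery is not implemented yet: pretend the log is empty and place the
 * log head one block past the recorded tail.
 */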
static int r5l_recovery_log(struct r5l_log *log)
{
	/* fake recovery */
	log->seq = log->last_cp_seq + 1;
	log->log_start = r5l_ring_add(log, log->last_checkpoint, BLOCK_SECTORS);
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

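/*
 * Read the meta block at the recorded journal_tail and validate its magic,
 * version, checksum and position. If anything is wrong (e.g. a brand new
 * journal device), start a fresh log at sector 0 with a random sequence
 * number and update the superblock; otherwise resume from the stored state.
 */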
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super block points to the correct address.
		 * The log might get data very soon; if the super block
		 * doesn't record the correct log tail address, recovery
		 * can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

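/*
 * Set up the in-memory log state for a journal rdev. The crc32 of the array
 * UUID seeds every checksum computed over meta blocks and data pages, so log
 * blocks written for a different array fail verification.
 */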
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->uuid_checksum = crc32_le(~0, (void *)rdev->mddev->uuid,
				      sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	conf->log = log;
	return 0;
error:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}