1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 17 * Boston, MA 02111-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <linux/vmalloc.h>
35#include <asm/div64.h>
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50/*
51 * set when this rbio is sitting in the hash, but it is just a cache
52 * of past RMW
53 */
54#define RBIO_CACHE_BIT 2
55
56/*
57 * set when it is safe to trust the stripe_pages for caching
58 */
59#define RBIO_CACHE_READY_BIT 3
60
61#define RBIO_CACHE_SIZE 1024
62
63struct btrfs_raid_bio {
64 struct btrfs_fs_info *fs_info;
65 struct btrfs_bio *bbio;
66
67 /*
68 * logical block numbers for the start of each stripe
69 * The last one or two are p/q. These are sorted,
70 * so raid_map[0] is the start of our full stripe
71 */
72 u64 *raid_map;
73
74 /* while we're doing rmw on a stripe
75 * we put it into a hash table so we can
76 * lock the stripe and merge more rbios
77 * into it.
78 */
79 struct list_head hash_list;
80
81 /*
82 * LRU list for the stripe cache
83 */
84 struct list_head stripe_cache;
85
86 /*
87 * for scheduling work in the helper threads
88 */
89 struct btrfs_work work;
90
91 /*
92 * bio list and bio_list_lock are used
93 * to add more bios into the stripe
94 * in hopes of avoiding the full rmw
95 */
96 struct bio_list bio_list;
97 spinlock_t bio_list_lock;
98
99 /* also protected by the bio_list_lock, the
100 * plug list is used by the plugging code
101 * to collect partial bios while plugged. The
102 * stripe locking code also uses it to hand off
103 * the stripe lock to the next pending IO
104 */
105 struct list_head plug_list;
106
107 /*
108 * flags that tell us if it is safe to
109 * merge with this bio
110 */
111 unsigned long flags;
112
113 /* size of each individual stripe on disk */
114 int stripe_len;
115
116 /* number of data stripes (no p/q) */
117 int nr_data;
118
119 /*
120 * set if we're doing a parity rebuild
121 * for a read from higher up, which is handled
122 * differently from a parity rebuild as part of
123 * rmw
124 */
125 int read_rebuild;
126
127 /* first bad stripe */
128 int faila;
129
130 /* second bad stripe (for raid6 use) */
131 int failb;
132
133 /*
134 * number of pages needed to represent the full
135 * stripe
136 */
137 int nr_pages;
138
139 /*
140 * size of all the bios in the bio_list. This
141 * helps us decide if the rbio maps to a full
142 * stripe or not
143 */
144 int bio_list_bytes;
145
146 atomic_t refs;
147
148
149 atomic_t stripes_pending;
150
151 atomic_t error;
152 /*
153 * these are two arrays of pointers. We allocate the
154 * rbio big enough to hold them both and setup their
155 * locations when the rbio is allocated
156 */
157
158 /* pointers to pages that we allocated for
159 * reading/writing stripes directly from the disk (including P/Q)
160 */
161 struct page **stripe_pages;
162
163 /*
164 * pointers to the pages in the bio_list. Stored
165 * here for faster lookup
166 */
167 struct page **bio_pages;
168};
169
170static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
171static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
172static void rmw_work(struct btrfs_work *work);
173static void read_rebuild_work(struct btrfs_work *work);
174static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
175static void async_read_rebuild(struct btrfs_raid_bio *rbio);
176static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
177static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
178static void __free_raid_bio(struct btrfs_raid_bio *rbio);
179static void index_rbio_pages(struct btrfs_raid_bio *rbio);
180static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
181
182/*
183 * the stripe hash table is used for locking, and to collect
184 * bios in hopes of making a full stripe
185 */
186int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
187{
188 struct btrfs_stripe_hash_table *table;
189 struct btrfs_stripe_hash_table *x;
190 struct btrfs_stripe_hash *cur;
191 struct btrfs_stripe_hash *h;
192 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
193 int i;
194 int table_size;
195
196 if (info->stripe_hash_table)
197 return 0;
198
199 /*
200 * The table is large, starting with order 4 and can go as high as
201 * order 7 in case lock debugging is turned on.
202 *
203 * Try harder to allocate and fallback to vmalloc to lower the chance
204 * of a failing mount.
205 */
206 table_size = sizeof(*table) + sizeof(*h) * num_entries;
207 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
208 if (!table) {
209 table = vzalloc(table_size);
210 if (!table)
211 return -ENOMEM;
212 }
213
214 spin_lock_init(&table->cache_lock);
215 INIT_LIST_HEAD(&table->stripe_cache);
216
217 h = table->table;
218
219 for (i = 0; i < num_entries; i++) {
220 cur = h + i;
221 INIT_LIST_HEAD(&cur->hash_list);
222 spin_lock_init(&cur->lock);
223 init_waitqueue_head(&cur->wait);
224 }
225
226 x = cmpxchg(&info->stripe_hash_table, NULL, table);
227 if (x) {
228 if (is_vmalloc_addr(x))
229 vfree(x);
230 else
231 kfree(x);
232 }
233 return 0;
234}
235
236/*
237 * caching an rbio means to copy anything from the
238 * bio_pages array into the stripe_pages array. We
239 * use the page uptodate bit in the stripe cache array
240 * to indicate if it has valid data
241 *
242 * once the caching is done, we set the cache ready
243 * bit.
244 */
245static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
246{
247 int i;
248 char *s;
249 char *d;
250 int ret;
251
252 ret = alloc_rbio_pages(rbio);
253 if (ret)
254 return;
255
256 for (i = 0; i < rbio->nr_pages; i++) {
257 if (!rbio->bio_pages[i])
258 continue;
259
260 s = kmap(rbio->bio_pages[i]);
261 d = kmap(rbio->stripe_pages[i]);
262
263 memcpy(d, s, PAGE_CACHE_SIZE);
264
265 kunmap(rbio->bio_pages[i]);
266 kunmap(rbio->stripe_pages[i]);
267 SetPageUptodate(rbio->stripe_pages[i]);
268 }
269 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
270}
271
272/*
273 * we hash on the first logical address of the stripe
274 */
275static int rbio_bucket(struct btrfs_raid_bio *rbio)
276{
277 u64 num = rbio->raid_map[0];
278
279 /*
280 * we shift down quite a bit. We're using byte
281 * addressing, and most of the lower bits are zeros.
282 * This tends to upset hash_64, and it consistently
283 * returns just one or two different values.
284 *
285 * shifting off the lower bits fixes things.
286 */
287 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
288}
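/*
 * Worked example of the shift above, with illustrative numbers only:
 * assuming 64K stripes, raid_map[0] is 64K aligned, so a value such as
 * 0x12340000 has at least 16 low zero bits.  Hashing 0x12340000 >> 16
 * (0x1234) spreads full stripes across the buckets instead of collapsing
 * them into just a few hash values.
 */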
289
290/*
291 * stealing an rbio means taking all the uptodate pages from the stripe
292 * array in the source rbio and putting them into the destination rbio
293 */
294static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
295{
296 int i;
297 struct page *s;
298 struct page *d;
299
300 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
301 return;
302
303 for (i = 0; i < dest->nr_pages; i++) {
304 s = src->stripe_pages[i];
305 if (!s || !PageUptodate(s)) {
306 continue;
307 }
308
309 d = dest->stripe_pages[i];
310 if (d)
311 __free_page(d);
312
313 dest->stripe_pages[i] = s;
314 src->stripe_pages[i] = NULL;
315 }
316}
317
318/*
319 * merging means we take the bio_list from the victim and
320 * splice it into the destination. The victim should
321 * be discarded afterwards.
322 *
 323 * must be called with dest->bio_list_lock held
324 */
325static void merge_rbio(struct btrfs_raid_bio *dest,
326 struct btrfs_raid_bio *victim)
327{
328 bio_list_merge(&dest->bio_list, &victim->bio_list);
329 dest->bio_list_bytes += victim->bio_list_bytes;
330 bio_list_init(&victim->bio_list);
331}
332
333/*
334 * used to prune items that are in the cache. The caller
335 * must hold the hash table lock.
336 */
337static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
338{
339 int bucket = rbio_bucket(rbio);
340 struct btrfs_stripe_hash_table *table;
341 struct btrfs_stripe_hash *h;
342 int freeit = 0;
343
344 /*
345 * check the bit again under the hash table lock.
346 */
347 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
348 return;
349
350 table = rbio->fs_info->stripe_hash_table;
351 h = table->table + bucket;
352
353 /* hold the lock for the bucket because we may be
354 * removing it from the hash table
355 */
356 spin_lock(&h->lock);
357
358 /*
359 * hold the lock for the bio list because we need
360 * to make sure the bio list is empty
361 */
362 spin_lock(&rbio->bio_list_lock);
363
364 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
365 list_del_init(&rbio->stripe_cache);
366 table->cache_size -= 1;
367 freeit = 1;
368
369 /* if the bio list isn't empty, this rbio is
370 * still involved in an IO. We take it out
371 * of the cache list, and drop the ref that
372 * was held for the list.
373 *
374 * If the bio_list was empty, we also remove
375 * the rbio from the hash_table, and drop
376 * the corresponding ref
377 */
378 if (bio_list_empty(&rbio->bio_list)) {
379 if (!list_empty(&rbio->hash_list)) {
380 list_del_init(&rbio->hash_list);
381 atomic_dec(&rbio->refs);
382 BUG_ON(!list_empty(&rbio->plug_list));
383 }
384 }
385 }
386
387 spin_unlock(&rbio->bio_list_lock);
388 spin_unlock(&h->lock);
389
390 if (freeit)
391 __free_raid_bio(rbio);
392}
393
394/*
395 * prune a given rbio from the cache
396 */
397static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
398{
399 struct btrfs_stripe_hash_table *table;
400 unsigned long flags;
401
402 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
403 return;
404
405 table = rbio->fs_info->stripe_hash_table;
406
407 spin_lock_irqsave(&table->cache_lock, flags);
408 __remove_rbio_from_cache(rbio);
409 spin_unlock_irqrestore(&table->cache_lock, flags);
410}
411
412/*
413 * remove everything in the cache
414 */
415static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
416{
417 struct btrfs_stripe_hash_table *table;
418 unsigned long flags;
419 struct btrfs_raid_bio *rbio;
420
421 table = info->stripe_hash_table;
422
423 spin_lock_irqsave(&table->cache_lock, flags);
424 while (!list_empty(&table->stripe_cache)) {
425 rbio = list_entry(table->stripe_cache.next,
426 struct btrfs_raid_bio,
427 stripe_cache);
428 __remove_rbio_from_cache(rbio);
429 }
430 spin_unlock_irqrestore(&table->cache_lock, flags);
431}
432
433/*
434 * remove all cached entries and free the hash table
435 * used by unmount
436 */
437void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
438{
439 if (!info->stripe_hash_table)
440 return;
441 btrfs_clear_rbio_cache(info);
442 if (is_vmalloc_addr(info->stripe_hash_table))
443 vfree(info->stripe_hash_table);
444 else
445 kfree(info->stripe_hash_table);
446 info->stripe_hash_table = NULL;
447}
448
449/*
450 * insert an rbio into the stripe cache. It
451 * must have already been prepared by calling
452 * cache_rbio_pages
453 *
454 * If this rbio was already cached, it gets
455 * moved to the front of the lru.
456 *
457 * If the size of the rbio cache is too big, we
458 * prune an item.
459 */
460static void cache_rbio(struct btrfs_raid_bio *rbio)
461{
462 struct btrfs_stripe_hash_table *table;
463 unsigned long flags;
464
465 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
466 return;
467
468 table = rbio->fs_info->stripe_hash_table;
469
470 spin_lock_irqsave(&table->cache_lock, flags);
471 spin_lock(&rbio->bio_list_lock);
472
473 /* bump our ref if we were not in the list before */
474 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
475 atomic_inc(&rbio->refs);
476
477 if (!list_empty(&rbio->stripe_cache)){
478 list_move(&rbio->stripe_cache, &table->stripe_cache);
479 } else {
480 list_add(&rbio->stripe_cache, &table->stripe_cache);
481 table->cache_size += 1;
482 }
483
484 spin_unlock(&rbio->bio_list_lock);
485
486 if (table->cache_size > RBIO_CACHE_SIZE) {
487 struct btrfs_raid_bio *found;
488
489 found = list_entry(table->stripe_cache.prev,
490 struct btrfs_raid_bio,
491 stripe_cache);
492
493 if (found != rbio)
494 __remove_rbio_from_cache(found);
495 }
496
497 spin_unlock_irqrestore(&table->cache_lock, flags);
498 return;
499}
500
501/*
502 * helper function to run the xor_blocks api. It is only
503 * able to do MAX_XOR_BLOCKS at a time, so we need to
504 * loop through.
505 */
506static void run_xor(void **pages, int src_cnt, ssize_t len)
507{
508 int src_off = 0;
509 int xor_src_cnt = 0;
510 void *dest = pages[src_cnt];
511
512 while(src_cnt > 0) {
513 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
514 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
515
516 src_cnt -= xor_src_cnt;
517 src_off += xor_src_cnt;
518 }
519}
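/*
 * Usage sketch (this is how finish_rmw() below builds raid5 parity once
 * every page of the stripe is mapped into the pointers array):
 *
 *	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
 *	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
 *
 * pointers[nr_data] is the parity page: it starts as a copy of the first
 * data page and the remaining nr_data - 1 data pages are xored into it.
 */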
520
521/*
522 * returns true if the bio list inside this rbio
523 * covers an entire stripe (no rmw required).
524 * Must be called with the bio list lock held, or
525 * at a time when you know it is impossible to add
526 * new bios into the list
527 */
528static int __rbio_is_full(struct btrfs_raid_bio *rbio)
529{
530 unsigned long size = rbio->bio_list_bytes;
531 int ret = 1;
532
533 if (size != rbio->nr_data * rbio->stripe_len)
534 ret = 0;
535
536 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
537 return ret;
538}
539
540static int rbio_is_full(struct btrfs_raid_bio *rbio)
541{
542 unsigned long flags;
543 int ret;
544
545 spin_lock_irqsave(&rbio->bio_list_lock, flags);
546 ret = __rbio_is_full(rbio);
547 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
548 return ret;
549}
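/*
 * Example (assuming two data stripes of 64K each): the rbio only counts
 * as full when bio_list_bytes == 2 * 64K, i.e. the queued bios cover every
 * data byte of the full stripe and no read-modify-write cycle is needed.
 */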
550
551/*
552 * returns 1 if it is safe to merge two rbios together.
553 * The merging is safe if the two rbios correspond to
554 * the same stripe and if they are both going in the same
555 * direction (read vs write), and if neither one is
556 * locked for final IO
557 *
558 * The caller is responsible for locking such that
559 * rmw_locked is safe to test
560 */
561static int rbio_can_merge(struct btrfs_raid_bio *last,
562 struct btrfs_raid_bio *cur)
563{
564 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
565 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
566 return 0;
567
568 /*
569 * we can't merge with cached rbios, since the
570 * idea is that when we merge the destination
571 * rbio is going to run our IO for us. We can
572 * steal from cached rbio's though, other functions
573 * handle that.
574 */
575 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
576 test_bit(RBIO_CACHE_BIT, &cur->flags))
577 return 0;
578
579 if (last->raid_map[0] !=
580 cur->raid_map[0])
581 return 0;
582
583 /* reads can't merge with writes */
584 if (last->read_rebuild !=
585 cur->read_rebuild) {
586 return 0;
587 }
588
589 return 1;
590}
591
592/*
593 * helper to index into the pstripe
594 */
595static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
596{
597 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
598 return rbio->stripe_pages[index];
599}
600
601/*
602 * helper to index into the qstripe, returns null
603 * if there is no qstripe
604 */
605static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
606{
607 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
608 return NULL;
609
610 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
611 PAGE_CACHE_SHIFT;
612 return rbio->stripe_pages[index];
613}
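/*
 * Index math example for the two helpers above (assuming 4K pages, a 64K
 * stripe_len and nr_data == 2): each stripe spans 16 pages, so page 0 of
 * the P stripe is stripe_pages[32] and page 0 of the Q stripe, when one
 * exists, is stripe_pages[48].
 */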
614
615/*
616 * The first stripe in the table for a logical address
617 * has the lock. rbios are added in one of three ways:
618 *
619 * 1) Nobody has the stripe locked yet. The rbio is given
620 * the lock and 0 is returned. The caller must start the IO
621 * themselves.
622 *
623 * 2) Someone has the stripe locked, but we're able to merge
624 * with the lock owner. The rbio is freed and the IO will
625 * start automatically along with the existing rbio. 1 is returned.
626 *
627 * 3) Someone has the stripe locked, but we're not able to merge.
628 * The rbio is added to the lock owner's plug list, or merged into
629 * an rbio already on the plug list. When the lock owner unlocks,
630 * the next rbio on the list is run and the IO is started automatically.
631 * 1 is returned
632 *
633 * If we return 0, the caller still owns the rbio and must continue with
634 * IO submission. If we return 1, the caller must assume the rbio has
635 * already been freed.
636 */
637static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
638{
639 int bucket = rbio_bucket(rbio);
640 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
641 struct btrfs_raid_bio *cur;
642 struct btrfs_raid_bio *pending;
643 unsigned long flags;
644 DEFINE_WAIT(wait);
645 struct btrfs_raid_bio *freeit = NULL;
646 struct btrfs_raid_bio *cache_drop = NULL;
647 int ret = 0;
648 int walk = 0;
649
650 spin_lock_irqsave(&h->lock, flags);
651 list_for_each_entry(cur, &h->hash_list, hash_list) {
652 walk++;
653 if (cur->raid_map[0] == rbio->raid_map[0]) {
654 spin_lock(&cur->bio_list_lock);
655
656 /* can we steal this cached rbio's pages? */
657 if (bio_list_empty(&cur->bio_list) &&
658 list_empty(&cur->plug_list) &&
659 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
660 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
661 list_del_init(&cur->hash_list);
662 atomic_dec(&cur->refs);
663
664 steal_rbio(cur, rbio);
665 cache_drop = cur;
666 spin_unlock(&cur->bio_list_lock);
667
668 goto lockit;
669 }
670
671 /* can we merge into the lock owner? */
672 if (rbio_can_merge(cur, rbio)) {
673 merge_rbio(cur, rbio);
674 spin_unlock(&cur->bio_list_lock);
675 freeit = rbio;
676 ret = 1;
677 goto out;
678 }
679
680
681 /*
682 * we couldn't merge with the running
683 * rbio, see if we can merge with the
684 * pending ones. We don't have to
685 * check for rmw_locked because there
686 * is no way they are inside finish_rmw
687 * right now
688 */
689 list_for_each_entry(pending, &cur->plug_list,
690 plug_list) {
691 if (rbio_can_merge(pending, rbio)) {
692 merge_rbio(pending, rbio);
693 spin_unlock(&cur->bio_list_lock);
694 freeit = rbio;
695 ret = 1;
696 goto out;
697 }
698 }
699
700 /* no merging, put us on the tail of the plug list,
 701 * our rbio will be started when the currently
702 * running rbio unlocks
703 */
704 list_add_tail(&rbio->plug_list, &cur->plug_list);
705 spin_unlock(&cur->bio_list_lock);
706 ret = 1;
707 goto out;
708 }
709 }
710lockit:
711 atomic_inc(&rbio->refs);
712 list_add(&rbio->hash_list, &h->hash_list);
713out:
714 spin_unlock_irqrestore(&h->lock, flags);
715 if (cache_drop)
716 remove_rbio_from_cache(cache_drop);
717 if (freeit)
718 __free_raid_bio(freeit);
719 return ret;
720}
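/*
 * Typical caller pattern for the locking helper above, as used by
 * full_stripe_write() and partial_stripe_write() further down (shown
 * only as a usage sketch):
 *
 *	ret = lock_stripe_add(rbio);
 *	if (ret == 0)
 *		finish_rmw(rbio);
 *	return 0;
 *
 * A return of 1 means the rbio was merged into, or queued behind, the
 * current lock holder and the caller must not touch it again.
 */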
721
722/*
723 * called as rmw or parity rebuild is completed. If the plug list has more
724 * rbios waiting for this stripe, the next one on the list will be started
725 */
726static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
727{
728 int bucket;
729 struct btrfs_stripe_hash *h;
730 unsigned long flags;
731 int keep_cache = 0;
732
733 bucket = rbio_bucket(rbio);
734 h = rbio->fs_info->stripe_hash_table->table + bucket;
735
736 if (list_empty(&rbio->plug_list))
737 cache_rbio(rbio);
738
739 spin_lock_irqsave(&h->lock, flags);
740 spin_lock(&rbio->bio_list_lock);
741
742 if (!list_empty(&rbio->hash_list)) {
743 /*
744 * if we're still cached and there is no other IO
745 * to perform, just leave this rbio here for others
746 * to steal from later
747 */
748 if (list_empty(&rbio->plug_list) &&
749 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
750 keep_cache = 1;
751 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
752 BUG_ON(!bio_list_empty(&rbio->bio_list));
753 goto done;
754 }
755
756 list_del_init(&rbio->hash_list);
757 atomic_dec(&rbio->refs);
758
759 /*
760 * we use the plug list to hold all the rbios
761 * waiting for the chance to lock this stripe.
762 * hand the lock over to one of them.
763 */
764 if (!list_empty(&rbio->plug_list)) {
765 struct btrfs_raid_bio *next;
766 struct list_head *head = rbio->plug_list.next;
767
768 next = list_entry(head, struct btrfs_raid_bio,
769 plug_list);
770
771 list_del_init(&rbio->plug_list);
772
773 list_add(&next->hash_list, &h->hash_list);
774 atomic_inc(&next->refs);
775 spin_unlock(&rbio->bio_list_lock);
776 spin_unlock_irqrestore(&h->lock, flags);
777
778 if (next->read_rebuild)
779 async_read_rebuild(next);
780 else {
781 steal_rbio(rbio, next);
782 async_rmw_stripe(next);
783 }
784
785 goto done_nolock;
786 } else if (waitqueue_active(&h->wait)) {
787 spin_unlock(&rbio->bio_list_lock);
788 spin_unlock_irqrestore(&h->lock, flags);
789 wake_up(&h->wait);
790 goto done_nolock;
791 }
792 }
793done:
794 spin_unlock(&rbio->bio_list_lock);
795 spin_unlock_irqrestore(&h->lock, flags);
796
797done_nolock:
798 if (!keep_cache)
799 remove_rbio_from_cache(rbio);
800}
801
802static void __free_raid_bio(struct btrfs_raid_bio *rbio)
803{
804 int i;
805
806 WARN_ON(atomic_read(&rbio->refs) < 0);
807 if (!atomic_dec_and_test(&rbio->refs))
808 return;
809
810 WARN_ON(!list_empty(&rbio->stripe_cache));
811 WARN_ON(!list_empty(&rbio->hash_list));
812 WARN_ON(!bio_list_empty(&rbio->bio_list));
813
814 for (i = 0; i < rbio->nr_pages; i++) {
815 if (rbio->stripe_pages[i]) {
816 __free_page(rbio->stripe_pages[i]);
817 rbio->stripe_pages[i] = NULL;
818 }
819 }
820 kfree(rbio->raid_map);
821 kfree(rbio->bbio);
822 kfree(rbio);
823}
824
825static void free_raid_bio(struct btrfs_raid_bio *rbio)
826{
827 unlock_stripe(rbio);
828 __free_raid_bio(rbio);
829}
830
831/*
832 * this frees the rbio and runs through all the bios in the
833 * bio_list and calls end_io on them
834 */
835static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
836{
837 struct bio *cur = bio_list_get(&rbio->bio_list);
838 struct bio *next;
839 free_raid_bio(rbio);
840
841 while (cur) {
842 next = cur->bi_next;
843 cur->bi_next = NULL;
844 if (uptodate)
845 set_bit(BIO_UPTODATE, &cur->bi_flags);
846 bio_endio(cur, err);
847 cur = next;
848 }
849}
850
851/*
852 * end io function used by finish_rmw. When we finally
853 * get here, we've written a full stripe
854 */
855static void raid_write_end_io(struct bio *bio, int err)
856{
857 struct btrfs_raid_bio *rbio = bio->bi_private;
858
859 if (err)
860 fail_bio_stripe(rbio, bio);
861
862 bio_put(bio);
863
864 if (!atomic_dec_and_test(&rbio->stripes_pending))
865 return;
866
867 err = 0;
868
 869 /* OK, we have written all the stripes we need to. */
870 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
871 err = -EIO;
872
873 rbio_orig_end_io(rbio, err, 0);
874 return;
875}
876
877/*
878 * the read/modify/write code wants to use the original bio for
879 * any pages it included, and then use the rbio for everything
880 * else. This function decides if a given index (stripe number)
881 * and page number in that stripe fall inside the original bio
882 * or the rbio.
883 *
884 * if you set bio_list_only, you'll get a NULL back for any ranges
885 * that are outside the bio_list
886 *
887 * This doesn't take any refs on anything, you get a bare page pointer
888 * and the caller must bump refs as required.
889 *
890 * You must call index_rbio_pages once before you can trust
891 * the answers from this function.
892 */
893static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
894 int index, int pagenr, int bio_list_only)
895{
896 int chunk_page;
897 struct page *p = NULL;
898
899 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
900
901 spin_lock_irq(&rbio->bio_list_lock);
902 p = rbio->bio_pages[chunk_page];
903 spin_unlock_irq(&rbio->bio_list_lock);
904
905 if (p || bio_list_only)
906 return p;
907
908 return rbio->stripe_pages[chunk_page];
909}
910
911/*
912 * number of pages we need for the entire stripe across all the
913 * drives
914 */
915static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
916{
917 unsigned long nr = stripe_len * nr_stripes;
918 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
919}
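/*
 * Example (assuming 4K pages): a 64K stripe_len across 3 stripes covers
 * 192K, so this returns 48 and the rbio tracks 48 page slots for the
 * full stripe, parity included.
 */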
920
921/*
 922 * allocation and initial setup for the btrfs_raid_bio. Note that
923 * this does not allocate any pages for rbio->pages.
924 */
925static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
926 struct btrfs_bio *bbio, u64 *raid_map,
927 u64 stripe_len)
928{
929 struct btrfs_raid_bio *rbio;
930 int nr_data = 0;
931 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
932 void *p;
933
934 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
935 GFP_NOFS);
936 if (!rbio) {
937 kfree(raid_map);
938 kfree(bbio);
939 return ERR_PTR(-ENOMEM);
940 }
941
942 bio_list_init(&rbio->bio_list);
943 INIT_LIST_HEAD(&rbio->plug_list);
944 spin_lock_init(&rbio->bio_list_lock);
945 INIT_LIST_HEAD(&rbio->stripe_cache);
946 INIT_LIST_HEAD(&rbio->hash_list);
947 rbio->bbio = bbio;
948 rbio->raid_map = raid_map;
949 rbio->fs_info = root->fs_info;
950 rbio->stripe_len = stripe_len;
951 rbio->nr_pages = num_pages;
952 rbio->faila = -1;
953 rbio->failb = -1;
954 atomic_set(&rbio->refs, 1);
955 atomic_set(&rbio->error, 0);
956 atomic_set(&rbio->stripes_pending, 0);
957
958 /*
959 * the stripe_pages and bio_pages array point to the extra
960 * memory we allocated past the end of the rbio
961 */
962 p = rbio + 1;
963 rbio->stripe_pages = p;
964 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
965
966 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
967 nr_data = bbio->num_stripes - 2;
968 else
969 nr_data = bbio->num_stripes - 1;
970
971 rbio->nr_data = nr_data;
972 return rbio;
973}
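/*
 * Example of the nr_data calculation above (illustrative device counts):
 * a 4 stripe raid6 mapping ends with RAID6_Q_STRIPE in raid_map, giving
 * nr_data = 4 - 2 = 2, while a 3 stripe raid5 mapping has no Q marker and
 * ends up with nr_data = 3 - 1 = 2.
 */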
974
975/* allocate pages for all the stripes in the bio, including parity */
976static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
977{
978 int i;
979 struct page *page;
980
981 for (i = 0; i < rbio->nr_pages; i++) {
982 if (rbio->stripe_pages[i])
983 continue;
984 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
985 if (!page)
986 return -ENOMEM;
987 rbio->stripe_pages[i] = page;
988 ClearPageUptodate(page);
989 }
990 return 0;
991}
992
993/* allocate pages for just the p/q stripes */
994static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
995{
996 int i;
997 struct page *page;
998
999 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1000
1001 for (; i < rbio->nr_pages; i++) {
1002 if (rbio->stripe_pages[i])
1003 continue;
1004 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1005 if (!page)
1006 return -ENOMEM;
1007 rbio->stripe_pages[i] = page;
1008 }
1009 return 0;
1010}
1011
1012/*
1013 * add a single page from a specific stripe into our list of bios for IO
1014 * this will try to merge into existing bios if possible, and returns
1015 * zero if all went well.
1016 */
1017static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1018 struct bio_list *bio_list,
1019 struct page *page,
1020 int stripe_nr,
1021 unsigned long page_index,
1022 unsigned long bio_max_len)
1023{
1024 struct bio *last = bio_list->tail;
1025 u64 last_end = 0;
1026 int ret;
1027 struct bio *bio;
1028 struct btrfs_bio_stripe *stripe;
1029 u64 disk_start;
1030
1031 stripe = &rbio->bbio->stripes[stripe_nr];
1032 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1033
1034 /* if the device is missing, just fail this stripe */
1035 if (!stripe->dev->bdev)
1036 return fail_rbio_index(rbio, stripe_nr);
1037
1038 /* see if we can add this page onto our existing bio */
1039 if (last) {
1040 last_end = (u64)last->bi_iter.bi_sector << 9;
1041 last_end += last->bi_iter.bi_size;
1042
1043 /*
1044 * we can't merge these if they are from different
1045 * devices or if they are not contiguous
1046 */
1047 if (last_end == disk_start && stripe->dev->bdev &&
1048 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1049 last->bi_bdev == stripe->dev->bdev) {
1050 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1051 if (ret == PAGE_CACHE_SIZE)
1052 return 0;
1053 }
1054 }
1055
1056 /* put a new bio on the list */
1057 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1058 if (!bio)
1059 return -ENOMEM;
1060
1061 bio->bi_iter.bi_size = 0;
1062 bio->bi_bdev = stripe->dev->bdev;
1063 bio->bi_iter.bi_sector = disk_start >> 9;
1064 set_bit(BIO_UPTODATE, &bio->bi_flags);
1065
1066 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1067 bio_list_add(bio_list, bio);
1068 return 0;
1069}
1070
1071/*
1072 * while we're doing the read/modify/write cycle, we could
1073 * have errors in reading pages off the disk. This checks
1074 * for errors and if we're not able to read the page it'll
1075 * trigger parity reconstruction. The rmw will be finished
1076 * after we've reconstructed the failed stripes
1077 */
1078static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1079{
1080 if (rbio->faila >= 0 || rbio->failb >= 0) {
1081 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1082 __raid56_parity_recover(rbio);
1083 } else {
1084 finish_rmw(rbio);
1085 }
1086}
1087
1088/*
1089 * these are just the pages from the rbio array, not from anything
1090 * the FS sent down to us
1091 */
1092static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1093{
1094 int index;
1095 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1096 index += page;
1097 return rbio->stripe_pages[index];
1098}
1099
1100/*
1101 * helper function to walk our bio list and populate the bio_pages array with
1102 * the result. This seems expensive, but it is faster than constantly
1103 * searching through the bio list as we setup the IO in finish_rmw or stripe
1104 * reconstruction.
1105 *
1106 * This must be called before you trust the answers from page_in_rbio
1107 */
1108static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1109{
1110 struct bio *bio;
1111 u64 start;
1112 unsigned long stripe_offset;
1113 unsigned long page_index;
1114 struct page *p;
1115 int i;
1116
1117 spin_lock_irq(&rbio->bio_list_lock);
1118 bio_list_for_each(bio, &rbio->bio_list) {
1119 start = (u64)bio->bi_iter.bi_sector << 9;
1120 stripe_offset = start - rbio->raid_map[0];
1121 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1122
1123 for (i = 0; i < bio->bi_vcnt; i++) {
1124 p = bio->bi_io_vec[i].bv_page;
1125 rbio->bio_pages[page_index + i] = p;
1126 }
1127 }
1128 spin_unlock_irq(&rbio->bio_list_lock);
1129}
1130
1131/*
1132 * this is called from one of two situations. We either
1133 * have a full stripe from the higher layers, or we've read all
1134 * the missing bits off disk.
1135 *
1136 * This will calculate the parity and then send down any
1137 * changed blocks.
1138 */
1139static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1140{
1141 struct btrfs_bio *bbio = rbio->bbio;
1142 void *pointers[bbio->num_stripes];
1143 int stripe_len = rbio->stripe_len;
1144 int nr_data = rbio->nr_data;
1145 int stripe;
1146 int pagenr;
1147 int p_stripe = -1;
1148 int q_stripe = -1;
1149 struct bio_list bio_list;
1150 struct bio *bio;
1151 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1152 int ret;
1153
1154 bio_list_init(&bio_list);
1155
1156 if (bbio->num_stripes - rbio->nr_data == 1) {
1157 p_stripe = bbio->num_stripes - 1;
1158 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1159 p_stripe = bbio->num_stripes - 2;
1160 q_stripe = bbio->num_stripes - 1;
1161 } else {
1162 BUG();
1163 }
1164
1165 /* at this point we either have a full stripe,
1166 * or we've read the full stripe from the drive.
1167 * recalculate the parity and write the new results.
1168 *
1169 * We're not allowed to add any new bios to the
1170 * bio list here, anyone else that wants to
1171 * change this stripe needs to do their own rmw.
1172 */
1173 spin_lock_irq(&rbio->bio_list_lock);
1174 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1175 spin_unlock_irq(&rbio->bio_list_lock);
1176
1177 atomic_set(&rbio->error, 0);
1178
1179 /*
1180 * now that we've set rmw_locked, run through the
1181 * bio list one last time and map the page pointers
1182 *
1183 * We don't cache full rbios because we're assuming
1184 * the higher layers are unlikely to use this area of
1185 * the disk again soon. If they do use it again,
1186 * hopefully they will send another full bio.
1187 */
1188 index_rbio_pages(rbio);
1189 if (!rbio_is_full(rbio))
1190 cache_rbio_pages(rbio);
1191 else
1192 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1193
1194 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1195 struct page *p;
1196 /* first collect one page from each data stripe */
1197 for (stripe = 0; stripe < nr_data; stripe++) {
1198 p = page_in_rbio(rbio, stripe, pagenr, 0);
1199 pointers[stripe] = kmap(p);
1200 }
1201
1202 /* then add the parity stripe */
1203 p = rbio_pstripe_page(rbio, pagenr);
1204 SetPageUptodate(p);
1205 pointers[stripe++] = kmap(p);
1206
1207 if (q_stripe != -1) {
1208
1209 /*
1210 * raid6, add the qstripe and call the
1211 * library function to fill in our p/q
1212 */
1213 p = rbio_qstripe_page(rbio, pagenr);
1214 SetPageUptodate(p);
1215 pointers[stripe++] = kmap(p);
1216
1217 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1218 pointers);
1219 } else {
1220 /* raid5 */
1221 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1222 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1223 }
1224
1225
1226 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1227 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1228 }
1229
1230 /*
1231 * time to start writing. Make bios for everything from the
1232 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1233 * everything else.
1234 */
1235 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1236 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1237 struct page *page;
1238 if (stripe < rbio->nr_data) {
1239 page = page_in_rbio(rbio, stripe, pagenr, 1);
1240 if (!page)
1241 continue;
1242 } else {
1243 page = rbio_stripe_page(rbio, stripe, pagenr);
1244 }
1245
1246 ret = rbio_add_io_page(rbio, &bio_list,
1247 page, stripe, pagenr, rbio->stripe_len);
1248 if (ret)
1249 goto cleanup;
1250 }
1251 }
1252
1253 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1254 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1255
1256 while (1) {
1257 bio = bio_list_pop(&bio_list);
1258 if (!bio)
1259 break;
1260
1261 bio->bi_private = rbio;
1262 bio->bi_end_io = raid_write_end_io;
1263 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1264 submit_bio(WRITE, bio);
1265 }
1266 return;
1267
1268cleanup:
1269 rbio_orig_end_io(rbio, -EIO, 0);
1270}
1271
1272/*
1273 * helper to find the stripe number for a given bio. Used to figure out which
1274 * stripe has failed. This expects the bio to correspond to a physical disk,
1275 * so it looks up based on physical sector numbers.
1276 */
1277static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1278 struct bio *bio)
1279{
1280 u64 physical = bio->bi_iter.bi_sector;
1281 u64 stripe_start;
1282 int i;
1283 struct btrfs_bio_stripe *stripe;
1284
1285 physical <<= 9;
1286
1287 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1288 stripe = &rbio->bbio->stripes[i];
1289 stripe_start = stripe->physical;
1290 if (physical >= stripe_start &&
1291 physical < stripe_start + rbio->stripe_len) {
1292 return i;
1293 }
1294 }
1295 return -1;
1296}
1297
1298/*
1299 * helper to find the stripe number for a given
1300 * bio (before mapping). Used to figure out which stripe has
1301 * failed. This looks up based on logical block numbers.
1302 */
1303static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1304 struct bio *bio)
1305{
1306 u64 logical = bio->bi_iter.bi_sector;
1307 u64 stripe_start;
1308 int i;
1309
1310 logical <<= 9;
1311
1312 for (i = 0; i < rbio->nr_data; i++) {
1313 stripe_start = rbio->raid_map[i];
1314 if (logical >= stripe_start &&
1315 logical < stripe_start + rbio->stripe_len) {
1316 return i;
1317 }
1318 }
1319 return -1;
1320}
1321
1322/*
1323 * returns -EIO if we had too many failures
1324 */
1325static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1326{
1327 unsigned long flags;
1328 int ret = 0;
1329
1330 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1331
1332 /* we already know this stripe is bad, move on */
1333 if (rbio->faila == failed || rbio->failb == failed)
1334 goto out;
1335
1336 if (rbio->faila == -1) {
1337 /* first failure on this rbio */
1338 rbio->faila = failed;
1339 atomic_inc(&rbio->error);
1340 } else if (rbio->failb == -1) {
1341 /* second failure on this rbio */
1342 rbio->failb = failed;
1343 atomic_inc(&rbio->error);
1344 } else {
1345 ret = -EIO;
1346 }
1347out:
1348 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1349
1350 return ret;
1351}
1352
1353/*
1354 * helper to fail a stripe based on a physical disk
1355 * bio.
1356 */
1357static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1358 struct bio *bio)
1359{
1360 int failed = find_bio_stripe(rbio, bio);
1361
1362 if (failed < 0)
1363 return -EIO;
1364
1365 return fail_rbio_index(rbio, failed);
1366}
1367
1368/*
1369 * this sets each page in the bio uptodate. It should only be used on private
1370 * rbio pages, nothing that comes in from the higher layers
1371 */
1372static void set_bio_pages_uptodate(struct bio *bio)
1373{
1374 int i;
1375 struct page *p;
1376
1377 for (i = 0; i < bio->bi_vcnt; i++) {
1378 p = bio->bi_io_vec[i].bv_page;
1379 SetPageUptodate(p);
1380 }
1381}
1382
1383/*
1384 * end io for the read phase of the rmw cycle. All the bios here are physical
1385 * stripe bios we've read from the disk so we can recalculate the parity of the
1386 * stripe.
1387 *
1388 * This will usually kick off finish_rmw once all the bios are read in, but it
1389 * may trigger parity reconstruction if we had any errors along the way
1390 */
1391static void raid_rmw_end_io(struct bio *bio, int err)
1392{
1393 struct btrfs_raid_bio *rbio = bio->bi_private;
1394
1395 if (err)
1396 fail_bio_stripe(rbio, bio);
1397 else
1398 set_bio_pages_uptodate(bio);
1399
1400 bio_put(bio);
1401
1402 if (!atomic_dec_and_test(&rbio->stripes_pending))
1403 return;
1404
1405 err = 0;
1406 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1407 goto cleanup;
1408
1409 /*
1410 * this will normally call finish_rmw to start our write
1411 * but if there are any failed stripes we'll reconstruct
1412 * from parity first
1413 */
1414 validate_rbio_for_rmw(rbio);
1415 return;
1416
1417cleanup:
1418
1419 rbio_orig_end_io(rbio, -EIO, 0);
1420}
1421
1422static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1423{
1424 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1425 rmw_work, NULL, NULL);
1426
1427 btrfs_queue_work(rbio->fs_info->rmw_workers,
1428 &rbio->work);
1429}
1430
1431static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1432{
1433 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1434 read_rebuild_work, NULL, NULL);
1435
1436 btrfs_queue_work(rbio->fs_info->rmw_workers,
1437 &rbio->work);
1438}
1439
1440/*
1441 * the stripe must be locked by the caller. It will
1442 * unlock after all the writes are done
1443 */
1444static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1445{
1446 int bios_to_read = 0;
1447 struct bio_list bio_list;
1448 int ret;
1449 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1450 int pagenr;
1451 int stripe;
1452 struct bio *bio;
1453
1454 bio_list_init(&bio_list);
1455
1456 ret = alloc_rbio_pages(rbio);
1457 if (ret)
1458 goto cleanup;
1459
1460 index_rbio_pages(rbio);
1461
1462 atomic_set(&rbio->error, 0);
1463 /*
1464 * build a list of bios to read all the missing parts of this
1465 * stripe
1466 */
1467 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1468 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1469 struct page *page;
1470 /*
1471 * we want to find all the pages missing from
1472 * the rbio and read them from the disk. If
1473 * page_in_rbio finds a page in the bio list
1474 * we don't need to read it off the stripe.
1475 */
1476 page = page_in_rbio(rbio, stripe, pagenr, 1);
1477 if (page)
1478 continue;
1479
1480 page = rbio_stripe_page(rbio, stripe, pagenr);
1481 /*
1482 * the bio cache may have handed us an uptodate
1483 * page. If so, be happy and use it
1484 */
1485 if (PageUptodate(page))
1486 continue;
1487
1488 ret = rbio_add_io_page(rbio, &bio_list, page,
1489 stripe, pagenr, rbio->stripe_len);
1490 if (ret)
1491 goto cleanup;
1492 }
1493 }
1494
1495 bios_to_read = bio_list_size(&bio_list);
1496 if (!bios_to_read) {
1497 /*
1498 * this can happen if others have merged with
1499 * us, it means there is nothing left to read.
1500 * But if there are missing devices it may not be
1501 * safe to do the full stripe write yet.
1502 */
1503 goto finish;
1504 }
1505
1506 /*
1507 * the bbio may be freed once we submit the last bio. Make sure
1508 * not to touch it after that
1509 */
1510 atomic_set(&rbio->stripes_pending, bios_to_read);
1511 while (1) {
1512 bio = bio_list_pop(&bio_list);
1513 if (!bio)
1514 break;
1515
1516 bio->bi_private = rbio;
1517 bio->bi_end_io = raid_rmw_end_io;
1518
1519 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1520 BTRFS_WQ_ENDIO_RAID56);
1521
1522 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1523 submit_bio(READ, bio);
1524 }
1525 /* the actual write will happen once the reads are done */
1526 return 0;
1527
1528cleanup:
1529 rbio_orig_end_io(rbio, -EIO, 0);
1530 return -EIO;
1531
1532finish:
1533 validate_rbio_for_rmw(rbio);
1534 return 0;
1535}
1536
1537/*
1538 * if the upper layers pass in a full stripe, we thank them by only allocating
1539 * enough pages to hold the parity, and sending it all down quickly.
1540 */
1541static int full_stripe_write(struct btrfs_raid_bio *rbio)
1542{
1543 int ret;
1544
1545 ret = alloc_rbio_parity_pages(rbio);
1546 if (ret) {
1547 __free_raid_bio(rbio);
1548 return ret;
1549 }
1550
1551 ret = lock_stripe_add(rbio);
1552 if (ret == 0)
1553 finish_rmw(rbio);
1554 return 0;
1555}
1556
1557/*
1558 * partial stripe writes get handed over to async helpers.
1559 * We're really hoping to merge a few more writes into this
1560 * rbio before calculating new parity
1561 */
1562static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1563{
1564 int ret;
1565
1566 ret = lock_stripe_add(rbio);
1567 if (ret == 0)
1568 async_rmw_stripe(rbio);
1569 return 0;
1570}
1571
1572/*
1573 * sometimes while we were reading from the drive to
 1574 * recalculate parity, enough new bios come in to create
1575 * a full stripe. So we do a check here to see if we can
1576 * go directly to finish_rmw
1577 */
1578static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1579{
1580 /* head off into rmw land if we don't have a full stripe */
1581 if (!rbio_is_full(rbio))
1582 return partial_stripe_write(rbio);
1583 return full_stripe_write(rbio);
1584}
1585
1586/*
1587 * We use plugging callbacks to collect full stripes.
1588 * Any time we get a partial stripe write while plugged
1589 * we collect it into a list. When the unplug comes down,
1590 * we sort the list by logical block number and merge
1591 * everything we can into the same rbios
1592 */
1593struct btrfs_plug_cb {
1594 struct blk_plug_cb cb;
1595 struct btrfs_fs_info *info;
1596 struct list_head rbio_list;
1597 struct btrfs_work work;
1598};
1599
1600/*
1601 * rbios on the plug list are sorted for easier merging.
1602 */
1603static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1604{
1605 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1606 plug_list);
1607 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1608 plug_list);
1609 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1610 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1611
1612 if (a_sector < b_sector)
1613 return -1;
1614 if (a_sector > b_sector)
1615 return 1;
1616 return 0;
1617}
1618
1619static void run_plug(struct btrfs_plug_cb *plug)
1620{
1621 struct btrfs_raid_bio *cur;
1622 struct btrfs_raid_bio *last = NULL;
1623
1624 /*
1625 * sort our plug list then try to merge
1626 * everything we can in hopes of creating full
1627 * stripes.
1628 */
1629 list_sort(NULL, &plug->rbio_list, plug_cmp);
1630 while (!list_empty(&plug->rbio_list)) {
1631 cur = list_entry(plug->rbio_list.next,
1632 struct btrfs_raid_bio, plug_list);
1633 list_del_init(&cur->plug_list);
1634
1635 if (rbio_is_full(cur)) {
1636 /* we have a full stripe, send it down */
1637 full_stripe_write(cur);
1638 continue;
1639 }
1640 if (last) {
1641 if (rbio_can_merge(last, cur)) {
1642 merge_rbio(last, cur);
1643 __free_raid_bio(cur);
1644 continue;
1645
1646 }
1647 __raid56_parity_write(last);
1648 }
1649 last = cur;
1650 }
1651 if (last) {
1652 __raid56_parity_write(last);
1653 }
1654 kfree(plug);
1655}
1656
1657/*
1658 * if the unplug comes from schedule, we have to push the
1659 * work off to a helper thread
1660 */
1661static void unplug_work(struct btrfs_work *work)
1662{
1663 struct btrfs_plug_cb *plug;
1664 plug = container_of(work, struct btrfs_plug_cb, work);
1665 run_plug(plug);
1666}
1667
1668static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1669{
1670 struct btrfs_plug_cb *plug;
1671 plug = container_of(cb, struct btrfs_plug_cb, cb);
1672
1673 if (from_schedule) {
1674 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1675 unplug_work, NULL, NULL);
1676 btrfs_queue_work(plug->info->rmw_workers,
1677 &plug->work);
1678 return;
1679 }
1680 run_plug(plug);
1681}
1682
1683/*
1684 * our main entry point for writes from the rest of the FS.
1685 */
1686int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1687 struct btrfs_bio *bbio, u64 *raid_map,
1688 u64 stripe_len)
1689{
1690 struct btrfs_raid_bio *rbio;
1691 struct btrfs_plug_cb *plug = NULL;
1692 struct blk_plug_cb *cb;
1693
1694 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1695 if (IS_ERR(rbio))
1696 return PTR_ERR(rbio);
1697 bio_list_add(&rbio->bio_list, bio);
1698 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1699
1700 /*
1701 * don't plug on full rbios, just get them out the door
1702 * as quickly as we can
1703 */
1704 if (rbio_is_full(rbio))
1705 return full_stripe_write(rbio);
1706
1707 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1708 sizeof(*plug));
1709 if (cb) {
1710 plug = container_of(cb, struct btrfs_plug_cb, cb);
1711 if (!plug->info) {
1712 plug->info = root->fs_info;
1713 INIT_LIST_HEAD(&plug->rbio_list);
1714 }
1715 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1716 } else {
1717 return __raid56_parity_write(rbio);
1718 }
1719 return 0;
1720}
1721
1722/*
1723 * all parity reconstruction happens here. We've read in everything
1724 * we can find from the drives and this does the heavy lifting of
1725 * sorting the good from the bad.
1726 */
1727static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1728{
1729 int pagenr, stripe;
1730 void **pointers;
1731 int faila = -1, failb = -1;
1732 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1733 struct page *page;
1734 int err;
1735 int i;
1736
1737 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1738 GFP_NOFS);
1739 if (!pointers) {
1740 err = -ENOMEM;
1741 goto cleanup_io;
1742 }
1743
1744 faila = rbio->faila;
1745 failb = rbio->failb;
1746
1747 if (rbio->read_rebuild) {
1748 spin_lock_irq(&rbio->bio_list_lock);
1749 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1750 spin_unlock_irq(&rbio->bio_list_lock);
1751 }
1752
1753 index_rbio_pages(rbio);
1754
1755 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1756 /* setup our array of pointers with pages
1757 * from each stripe
1758 */
1759 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1760 /*
1761 * if we're rebuilding a read, we have to use
1762 * pages from the bio list
1763 */
1764 if (rbio->read_rebuild &&
1765 (stripe == faila || stripe == failb)) {
1766 page = page_in_rbio(rbio, stripe, pagenr, 0);
1767 } else {
1768 page = rbio_stripe_page(rbio, stripe, pagenr);
1769 }
1770 pointers[stripe] = kmap(page);
1771 }
1772
1773 /* all raid6 handling here */
1774 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1775 RAID6_Q_STRIPE) {
1776
1777 /*
1778 * single failure, rebuild from parity raid5
1779 * style
1780 */
1781 if (failb < 0) {
1782 if (faila == rbio->nr_data) {
1783 /*
1784 * Just the P stripe has failed, without
1785 * a bad data or Q stripe.
1786 * TODO, we should redo the xor here.
1787 */
1788 err = -EIO;
1789 goto cleanup;
1790 }
1791 /*
1792 * a single failure in raid6 is rebuilt
1793 * in the pstripe code below
1794 */
1795 goto pstripe;
1796 }
1797
1798 /* make sure our ps and qs are in order */
1799 if (faila > failb) {
1800 int tmp = failb;
1801 failb = faila;
1802 faila = tmp;
1803 }
1804
1805 /* if the q stripe is failed, do a pstripe reconstruction
1806 * from the xors.
1807 * If both the q stripe and the P stripe are failed, we're
1808 * here due to a crc mismatch and we can't give them the
1809 * data they want
1810 */
1811 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1812 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1813 err = -EIO;
1814 goto cleanup;
1815 }
1816 /*
1817 * otherwise we have one bad data stripe and
1818 * a good P stripe. raid5!
1819 */
1820 goto pstripe;
1821 }
1822
1823 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1824 raid6_datap_recov(rbio->bbio->num_stripes,
1825 PAGE_SIZE, faila, pointers);
1826 } else {
1827 raid6_2data_recov(rbio->bbio->num_stripes,
1828 PAGE_SIZE, faila, failb,
1829 pointers);
1830 }
1831 } else {
1832 void *p;
1833
1834 /* rebuild from P stripe here (raid5 or raid6) */
1835 BUG_ON(failb != -1);
1836pstripe:
1837 /* Copy parity block into failed block to start with */
1838 memcpy(pointers[faila],
1839 pointers[rbio->nr_data],
1840 PAGE_CACHE_SIZE);
1841
1842 /* rearrange the pointer array */
1843 p = pointers[faila];
1844 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1845 pointers[stripe] = pointers[stripe + 1];
1846 pointers[rbio->nr_data - 1] = p;
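			/*
			 * e.g. with four data stripes and faila == 1 the
			 * array goes from { D0, P-copy, D2, D3 } to
			 * { D0, D2, D3, P-copy }, so run_xor() below can xor
			 * the first nr_data - 1 entries into the last slot
			 * and leave the rebuilt block in the page that used
			 * to hold D1
			 */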
1847
1848 /* xor in the rest */
1849 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1850 }
1851 /* if we're doing this rebuild as part of an rmw, go through
1852 * and set all of our private rbio pages in the
1853 * failed stripes as uptodate. This way finish_rmw will
1854 * know they can be trusted. If this was a read reconstruction,
1855 * other endio functions will fiddle the uptodate bits
1856 */
1857 if (!rbio->read_rebuild) {
1858 for (i = 0; i < nr_pages; i++) {
1859 if (faila != -1) {
1860 page = rbio_stripe_page(rbio, faila, i);
1861 SetPageUptodate(page);
1862 }
1863 if (failb != -1) {
1864 page = rbio_stripe_page(rbio, failb, i);
1865 SetPageUptodate(page);
1866 }
1867 }
1868 }
1869 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1870 /*
1871 * if we're rebuilding a read, we have to use
1872 * pages from the bio list
1873 */
1874 if (rbio->read_rebuild &&
1875 (stripe == faila || stripe == failb)) {
1876 page = page_in_rbio(rbio, stripe, pagenr, 0);
1877 } else {
1878 page = rbio_stripe_page(rbio, stripe, pagenr);
1879 }
1880 kunmap(page);
1881 }
1882 }
1883
1884 err = 0;
1885cleanup:
1886 kfree(pointers);
1887
1888cleanup_io:
1889
1890 if (rbio->read_rebuild) {
Chris Mason4ae10b32013-01-31 14:42:09 -05001891 if (err == 0)
1892 cache_rbio_pages(rbio);
1893 else
1894 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1895
David Woodhouse53b381b2013-01-29 18:40:14 -05001896 rbio_orig_end_io(rbio, err, err == 0);
1897 } else if (err == 0) {
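		/*
		 * the rebuild was part of an rmw cycle: clear the failure
		 * markers and let finish_rmw() compute parity and write
		 * out the full stripe
		 */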
1898 rbio->faila = -1;
1899 rbio->failb = -1;
1900 finish_rmw(rbio);
1901 } else {
1902 rbio_orig_end_io(rbio, err, 0);
1903 }
1904}
1905
1906/*
1907 * This is called only for stripes we've read from disk to
1908 * reconstruct the parity.
1909 */
1910static void raid_recover_end_io(struct bio *bio, int err)
1911{
1912 struct btrfs_raid_bio *rbio = bio->bi_private;
1913
1914 /*
1915 * we only read stripe pages off the disk, set them
1916 * up to date if there were no errors
1917 */
1918 if (err)
1919 fail_bio_stripe(rbio, bio);
1920 else
1921 set_bio_pages_uptodate(bio);
1922 bio_put(bio);
1923
Miao Xieb89e1b02014-10-15 11:18:44 +08001924 if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse53b381b2013-01-29 18:40:14 -05001925 return;
1926
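	/*
	 * too many stripes failed to reconstruct from (max_errors is 1
	 * for raid5 and 2 for raid6), so error out the original bio
	 */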
Miao Xieb89e1b02014-10-15 11:18:44 +08001927 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse53b381b2013-01-29 18:40:14 -05001928 rbio_orig_end_io(rbio, -EIO, 0);
1929 else
1930 __raid_recover_end_io(rbio);
1931}
1932
1933/*
1934 * reads everything we need off the disk to reconstruct
1935 * the parity. endio handlers trigger final reconstruction
1936 * when the IO is done.
1937 *
1938 * This is used both for reads from the higher layers and for
1939 * parity construction required to finish a rmw cycle.
1940 */
1941static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1942{
1943 int bios_to_read = 0;
1944 struct btrfs_bio *bbio = rbio->bbio;
1945 struct bio_list bio_list;
1946 int ret;
David Sterbaed6078f2014-06-05 01:59:57 +02001947 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
David Woodhouse53b381b2013-01-29 18:40:14 -05001948 int pagenr;
1949 int stripe;
1950 struct bio *bio;
1951
1952 bio_list_init(&bio_list);
1953
1954 ret = alloc_rbio_pages(rbio);
1955 if (ret)
1956 goto cleanup;
1957
Miao Xieb89e1b02014-10-15 11:18:44 +08001958 atomic_set(&rbio->error, 0);
David Woodhouse53b381b2013-01-29 18:40:14 -05001959
1960 /*
Chris Mason4ae10b32013-01-31 14:42:09 -05001961 * read everything that hasn't failed. Thanks to the
1962 * stripe cache, it is possible that some or all of these
1963 * pages are going to be uptodate.
David Woodhouse53b381b2013-01-29 18:40:14 -05001964 */
1965 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
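		/*
		 * count the stripes we already know are bad so the
		 * max_errors check in the endio handler sees them along
		 * with any new read failures
		 */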
Liu Bo55883832014-06-24 15:39:16 +08001966 if (rbio->faila == stripe || rbio->failb == stripe) {
Miao Xieb89e1b02014-10-15 11:18:44 +08001967 atomic_inc(&rbio->error);
David Woodhouse53b381b2013-01-29 18:40:14 -05001968 continue;
Liu Bo55883832014-06-24 15:39:16 +08001969 }
David Woodhouse53b381b2013-01-29 18:40:14 -05001970
1971 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1972 struct page *p;
1973
1974 /*
1975 * the rmw code may have already read this
1976 * page in
1977 */
1978 p = rbio_stripe_page(rbio, stripe, pagenr);
1979 if (PageUptodate(p))
1980 continue;
1981
1982 ret = rbio_add_io_page(rbio, &bio_list,
1983 rbio_stripe_page(rbio, stripe, pagenr),
1984 stripe, pagenr, rbio->stripe_len);
1985 if (ret < 0)
1986 goto cleanup;
1987 }
1988 }
1989
1990 bios_to_read = bio_list_size(&bio_list);
1991 if (!bios_to_read) {
1992 /*
 1993		 * we might have no bios to read either because the pages
 1994		 * were already up to date, or because the devices we
 1995		 * needed to read from were gone.
1996 */
Miao Xieb89e1b02014-10-15 11:18:44 +08001997 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse53b381b2013-01-29 18:40:14 -05001998 __raid_recover_end_io(rbio);
1999 goto out;
2000 } else {
2001 goto cleanup;
2002 }
2003 }
2004
2005 /*
2006 * the bbio may be freed once we submit the last bio. Make sure
2007 * not to touch it after that
2008 */
Miao Xieb89e1b02014-10-15 11:18:44 +08002009 atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse53b381b2013-01-29 18:40:14 -05002010 while (1) {
2011 bio = bio_list_pop(&bio_list);
2012 if (!bio)
2013 break;
2014
2015 bio->bi_private = rbio;
2016 bio->bi_end_io = raid_recover_end_io;
2017
2018 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2019 BTRFS_WQ_ENDIO_RAID56);
2020
2021 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2022 submit_bio(READ, bio);
2023 }
2024out:
2025 return 0;
2026
2027cleanup:
2028 if (rbio->read_rebuild)
2029 rbio_orig_end_io(rbio, -EIO, 0);
2030 return -EIO;
2031}
2032
2033/*
2034 * the main entry point for reads from the higher layers. This
2035 * is really only called when the normal read path had a failure,
2036 * so we assume the bio they send down corresponds to a failed part
2037 * of the drive.
2038 */
2039int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2040 struct btrfs_bio *bbio, u64 *raid_map,
2041 u64 stripe_len, int mirror_num)
2042{
2043 struct btrfs_raid_bio *rbio;
2044 int ret;
2045
2046 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
Dan Carpenter3dc0e812013-07-22 09:55:15 +03002047 if (IS_ERR(rbio))
David Woodhouse53b381b2013-01-29 18:40:14 -05002048 return PTR_ERR(rbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002049
2050 rbio->read_rebuild = 1;
2051 bio_list_add(&rbio->bio_list, bio);
Kent Overstreet4f024f32013-10-11 15:44:27 -07002052 rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse53b381b2013-01-29 18:40:14 -05002053
2054 rbio->faila = find_logical_bio_stripe(rbio, bio);
2055 if (rbio->faila == -1) {
2056 BUG();
Dan Carpenter3dc0e812013-07-22 09:55:15 +03002057 kfree(raid_map);
2058 kfree(bbio);
David Woodhouse53b381b2013-01-29 18:40:14 -05002059 kfree(rbio);
2060 return -EIO;
2061 }
2062
2063 /*
2064 * reconstruct from the q stripe if they are
2065 * asking for mirror 3
2066 */
2067 if (mirror_num == 3)
2068 rbio->failb = bbio->num_stripes - 2;
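	/*
	 * with failb pointing at the p stripe, the recovery code takes
	 * the datap path and rebuilds the data block from q
	 */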
2069
2070 ret = lock_stripe_add(rbio);
2071
2072 /*
2073 * __raid56_parity_recover will end the bio with
2074 * any errors it hits. We don't want to return
2075 * its error value up the stack because our caller
2076 * will end up calling bio_endio with any nonzero
2077 * return
2078 */
2079 if (ret == 0)
2080 __raid56_parity_recover(rbio);
2081 /*
 2082	 * otherwise our rbio has been added to the list of
 2083	 * rbios that will be handled after the
 2084	 * current lock owner is done
2085 */
2086 return 0;
2087
2088}
2089
2090static void rmw_work(struct btrfs_work *work)
2091{
2092 struct btrfs_raid_bio *rbio;
2093
2094 rbio = container_of(work, struct btrfs_raid_bio, work);
2095 raid56_rmw_stripe(rbio);
2096}
2097
2098static void read_rebuild_work(struct btrfs_work *work)
2099{
2100 struct btrfs_raid_bio *rbio;
2101
2102 rbio = container_of(work, struct btrfs_raid_bio, work);
2103 __raid56_parity_recover(rbio);
2104}