Blame - drivers/md/dm-thin.c - kernel/linux-linaro-stable.git

blob: 301db0f45d3bdbabed9eda3391458fe2aec78fe1 [file] [log] [blame]

Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 Red Hat UK.
				3	*
				4	* This file is released under the GPL.
				5	*/
				6
				7	#include "dm-thin-metadata.h"
				8
				9	#include <linux/device-mapper.h>
				10	#include <linux/dm-io.h>
				11	#include <linux/dm-kcopyd.h>
				12	#include <linux/list.h>
				13	#include <linux/init.h>
				14	#include <linux/module.h>
				15	#include <linux/slab.h>
				16
				17	#define DM_MSG_PREFIX "thin"
				18
				19	/*
				20	* Tunable constants
				21	*/
				22	#define ENDIO_HOOK_POOL_SIZE 10240
				23	#define DEFERRED_SET_SIZE 64
				24	#define MAPPING_POOL_SIZE 1024
				25	#define PRISON_CELLS 1024
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	26	#define COMMIT_PERIOD HZ
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	27
				28	/*
				29	* The block size of the device holding pool data must be
				30	* between 64KB and 1GB.
				31	*/
				32	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
				33	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
				34
				35	/*
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	36	* Device id is restricted to 24 bits.
				37	*/
				38	#define MAX_DEV_ID ((1 << 24) - 1)
				39
				40	/*
				41	* How do we handle breaking sharing of data blocks?
				42	* =================================================
				43	*
				44	* We use a standard copy-on-write btree to store the mappings for the
				45	* devices (note I'm talking about copy-on-write of the metadata here, not
				46	* the data). When you take an internal snapshot you clone the root node
				47	* of the origin btree. After this there is no concept of an origin or a
				48	* snapshot. They are just two device trees that happen to point to the
				49	* same data blocks.
				50	*
				51	* When we get a write in we decide if it's to a shared data block using
				52	* some timestamp magic. If it is, we have to break sharing.
				53	*
				54	* Let's say we write to a shared block in what was the origin. The
				55	* steps are:
				56	*
				57	* i) plug io further to this physical block. (see bio_prison code).
				58	*
				59	* ii) quiesce any read io to that shared data block. Obviously
				60	* including all devices that share this block. (see deferred_set code)
				61	*
				62	* iii) copy the data block to a newly allocated block. This step can be
				63	* missed out if the io covers the block. (schedule_copy).
				64	*
				65	* iv) insert the new mapping into the origin's btree
Joe Thornber	fe878f3	2012-03-28 18:41:24 +0100	[diff] [blame]	66	* (process_prepared_mapping). This act of inserting breaks some
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	67	* sharing of btree nodes between the two devices. Breaking sharing only
				68	* affects the btree of that specific device. Btrees for the other
				69	* devices that share the block never change. The btree for the origin
				70	* device as it was after the last commit is untouched, ie. we're using
				71	* persistent data structures in the functional programming sense.
				72	*
				73	* v) unplug io to this physical block, including the io that triggered
				74	* the breaking of sharing.
				75	*
				76	* Steps (ii) and (iii) occur in parallel.
				77	*
				78	* The metadata _doesn't_ need to be committed before the io continues. We
				79	* get away with this because the io is always written to a _new_ block.
				80	* If there's a crash, then:
				81	*
				82	* - The origin mapping will point to the old origin block (the shared
				83	* one). This will contain the data as it was before the io that triggered
				84	* the breaking of sharing came in.
				85	*
				86	* - The snap mapping still points to the old block. As it would after
				87	* the commit.
				88	*
				89	* The downside of this scheme is the timestamp magic isn't perfect, and
				90	* will continue to think that data block in the snapshot device is shared
				91	* even after the write to the origin has broken sharing. I suspect data
				92	* blocks will typically be shared by many different devices, so we're
				93	* breaking sharing n + 1 times, rather than n, where n is the number of
				94	* devices that reference this data block. At the moment I think the
				95	* benefits far, far outweigh the disadvantages.
				96	*/
				97
				98	/*----------------------------------------------------------------*/
				99
				100	/*
				101	* Sometimes we can't deal with a bio straight away. We put them in prison
				102	* where they can't cause any mischief. Bios are put in a cell identified
				103	* by a key, multiple bios can be in the same cell. When the cell is
				104	* subsequently unlocked the bios become available.
				105	*/
				106	struct bio_prison;
				107
				108	struct cell_key {
				109	int virtual;
				110	dm_thin_id dev;
				111	dm_block_t block;
				112	};
				113
				114	struct cell {
				115	struct hlist_node list;
				116	struct bio_prison *prison;
				117	struct cell_key key;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	118	struct bio *holder;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	119	struct bio_list bios;
				120	};
				121
				122	struct bio_prison {
				123	spinlock_t lock;
				124	mempool_t *cell_pool;
				125
				126	unsigned nr_buckets;
				127	unsigned hash_mask;
				128	struct hlist_head *cells;
				129	};
				130
				131	static uint32_t calc_nr_buckets(unsigned nr_cells)
				132	{
				133	uint32_t n = 128;
				134
				135	nr_cells /= 4;
				136	nr_cells = min(nr_cells, 8192u);
				137
				138	while (n < nr_cells)
				139	n <<= 1;
				140
				141	return n;
				142	}
				143
				144	/*
				145	* @nr_cells should be the number of cells you want in use _concurrently_.
				146	* Don't confuse it with the number of distinct keys.
				147	*/
				148	static struct bio_prison *prison_create(unsigned nr_cells)
				149	{
				150	unsigned i;
				151	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
				152	size_t len = sizeof(struct bio_prison) +
				153	(sizeof(struct hlist_head) * nr_buckets);
				154	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
				155
				156	if (!prison)
				157	return NULL;
				158
				159	spin_lock_init(&prison->lock);
				160	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
				161	sizeof(struct cell));
				162	if (!prison->cell_pool) {
				163	kfree(prison);
				164	return NULL;
				165	}
				166
				167	prison->nr_buckets = nr_buckets;
				168	prison->hash_mask = nr_buckets - 1;
				169	prison->cells = (struct hlist_head *) (prison + 1);
				170	for (i = 0; i < nr_buckets; i++)
				171	INIT_HLIST_HEAD(prison->cells + i);
				172
				173	return prison;
				174	}
				175
				176	static void prison_destroy(struct bio_prison *prison)
				177	{
				178	mempool_destroy(prison->cell_pool);
				179	kfree(prison);
				180	}
				181
				182	static uint32_t hash_key(struct bio_prison prison, struct cell_key key)
				183	{
				184	const unsigned long BIG_PRIME = 4294967291UL;
				185	uint64_t hash = key->block * BIG_PRIME;
				186
				187	return (uint32_t) (hash & prison->hash_mask);
				188	}
				189
				190	static int keys_equal(struct cell_key lhs, struct cell_key rhs)
				191	{
				192	return (lhs->virtual == rhs->virtual) &&
				193	(lhs->dev == rhs->dev) &&
				194	(lhs->block == rhs->block);
				195	}
				196
				197	static struct cell __search_bucket(struct hlist_head bucket,
				198	struct cell_key *key)
				199	{
				200	struct cell *cell;
				201	struct hlist_node *tmp;
				202
				203	hlist_for_each_entry(cell, tmp, bucket, list)
				204	if (keys_equal(&cell->key, key))
				205	return cell;
				206
				207	return NULL;
				208	}
				209
				210	/*
				211	* This may block if a new cell needs allocating. You must ensure that
				212	* cells will be unlocked even if the calling thread is blocked.
				213	*
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	214	* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	215	*/
				216	static int bio_detain(struct bio_prison prison, struct cell_key key,
				217	struct bio inmate, struct cell *ref)
				218	{
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	219	int r = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	220	unsigned long flags;
				221	uint32_t hash = hash_key(prison, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	222	struct cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	223
				224	BUG_ON(hash > prison->nr_buckets);
				225
				226	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	227
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	228	cell = __search_bucket(prison->cells + hash, key);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	229	if (cell) {
				230	bio_list_add(&cell->bios, inmate);
				231	goto out;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	232	}
				233
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	234	/*
				235	* Allocate a new cell
				236	*/
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	237	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	238	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
				239	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	240
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	241	/*
				242	* We've been unlocked, so we have to double check that
				243	* nobody else has inserted this cell in the meantime.
				244	*/
				245	cell = __search_bucket(prison->cells + hash, key);
				246	if (cell) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	247	mempool_free(cell2, prison->cell_pool);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	248	bio_list_add(&cell->bios, inmate);
				249	goto out;
				250	}
				251
				252	/*
				253	* Use new cell.
				254	*/
				255	cell = cell2;
				256
				257	cell->prison = prison;
				258	memcpy(&cell->key, key, sizeof(cell->key));
				259	cell->holder = inmate;
				260	bio_list_init(&cell->bios);
				261	hlist_add_head(&cell->list, prison->cells + hash);
				262
				263	r = 0;
				264
				265	out:
				266	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	267
				268	*ref = cell;
				269
				270	return r;
				271	}
				272
				273	/*
				274	* @inmates must have been initialised prior to this call
				275	*/
				276	static void __cell_release(struct cell cell, struct bio_list inmates)
				277	{
				278	struct bio_prison *prison = cell->prison;
				279
				280	hlist_del(&cell->list);
				281
Mike Snitzer	03aaae7	2012-05-12 01:43:12 +0100	[diff] [blame^]	282	if (inmates) {
				283	bio_list_add(inmates, cell->holder);
				284	bio_list_merge(inmates, &cell->bios);
				285	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	286
				287	mempool_free(cell, prison->cell_pool);
				288	}
				289
				290	static void cell_release(struct cell cell, struct bio_list bios)
				291	{
				292	unsigned long flags;
				293	struct bio_prison *prison = cell->prison;
				294
				295	spin_lock_irqsave(&prison->lock, flags);
				296	__cell_release(cell, bios);
				297	spin_unlock_irqrestore(&prison->lock, flags);
				298	}
				299
				300	/*
				301	* There are a couple of places where we put a bio into a cell briefly
				302	* before taking it out again. In these situations we know that no other
				303	* bio may be in the cell. This function releases the cell, and also does
				304	* a sanity check.
				305	*/
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	306	static void __cell_release_singleton(struct cell cell, struct bio bio)
				307	{
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	308	BUG_ON(cell->holder != bio);
				309	BUG_ON(!bio_list_empty(&cell->bios));
Mike Snitzer	03aaae7	2012-05-12 01:43:12 +0100	[diff] [blame^]	310
				311	__cell_release(cell, NULL);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	312	}
				313
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	314	static void cell_release_singleton(struct cell cell, struct bio bio)
				315	{
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	316	unsigned long flags;
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	317	struct bio_prison *prison = cell->prison;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	318
				319	spin_lock_irqsave(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	320	__cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	321	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	322	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	323
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	324	/*
				325	* Sometimes we don't want the holder, just the additional bios.
				326	*/
				327	static void __cell_release_no_holder(struct cell cell, struct bio_list inmates)
				328	{
				329	struct bio_prison *prison = cell->prison;
				330
				331	hlist_del(&cell->list);
				332	bio_list_merge(inmates, &cell->bios);
				333
				334	mempool_free(cell, prison->cell_pool);
				335	}
				336
				337	static void cell_release_no_holder(struct cell cell, struct bio_list inmates)
				338	{
				339	unsigned long flags;
				340	struct bio_prison *prison = cell->prison;
				341
				342	spin_lock_irqsave(&prison->lock, flags);
				343	__cell_release_no_holder(cell, inmates);
				344	spin_unlock_irqrestore(&prison->lock, flags);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	345	}
				346
				347	static void cell_error(struct cell *cell)
				348	{
				349	struct bio_prison *prison = cell->prison;
				350	struct bio_list bios;
				351	struct bio *bio;
				352	unsigned long flags;
				353
				354	bio_list_init(&bios);
				355
				356	spin_lock_irqsave(&prison->lock, flags);
				357	__cell_release(cell, &bios);
				358	spin_unlock_irqrestore(&prison->lock, flags);
				359
				360	while ((bio = bio_list_pop(&bios)))
				361	bio_io_error(bio);
				362	}
				363
				364	/*----------------------------------------------------------------*/
				365
				366	/*
				367	* We use the deferred set to keep track of pending reads to shared blocks.
				368	* We do this to ensure the new mapping caused by a write isn't performed
				369	* until these prior reads have completed. Otherwise the insertion of the
				370	* new mapping could free the old block that the read bios are mapped to.
				371	*/
				372
				373	struct deferred_set;
				374	struct deferred_entry {
				375	struct deferred_set *ds;
				376	unsigned count;
				377	struct list_head work_items;
				378	};
				379
				380	struct deferred_set {
				381	spinlock_t lock;
				382	unsigned current_entry;
				383	unsigned sweeper;
				384	struct deferred_entry entries[DEFERRED_SET_SIZE];
				385	};
				386
				387	static void ds_init(struct deferred_set *ds)
				388	{
				389	int i;
				390
				391	spin_lock_init(&ds->lock);
				392	ds->current_entry = 0;
				393	ds->sweeper = 0;
				394	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
				395	ds->entries[i].ds = ds;
				396	ds->entries[i].count = 0;
				397	INIT_LIST_HEAD(&ds->entries[i].work_items);
				398	}
				399	}
				400
				401	static struct deferred_entry ds_inc(struct deferred_set ds)
				402	{
				403	unsigned long flags;
				404	struct deferred_entry *entry;
				405
				406	spin_lock_irqsave(&ds->lock, flags);
				407	entry = ds->entries + ds->current_entry;
				408	entry->count++;
				409	spin_unlock_irqrestore(&ds->lock, flags);
				410
				411	return entry;
				412	}
				413
				414	static unsigned ds_next(unsigned index)
				415	{
				416	return (index + 1) % DEFERRED_SET_SIZE;
				417	}
				418
				419	static void __sweep(struct deferred_set ds, struct list_head head)
				420	{
				421	while ((ds->sweeper != ds->current_entry) &&
				422	!ds->entries[ds->sweeper].count) {
				423	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				424	ds->sweeper = ds_next(ds->sweeper);
				425	}
				426
				427	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
				428	list_splice_init(&ds->entries[ds->sweeper].work_items, head);
				429	}
				430
				431	static void ds_dec(struct deferred_entry entry, struct list_head head)
				432	{
				433	unsigned long flags;
				434
				435	spin_lock_irqsave(&entry->ds->lock, flags);
				436	BUG_ON(!entry->count);
				437	--entry->count;
				438	__sweep(entry->ds, head);
				439	spin_unlock_irqrestore(&entry->ds->lock, flags);
				440	}
				441
				442	/*
				443	* Returns 1 if deferred or 0 if no pending items to delay job.
				444	*/
				445	static int ds_add_work(struct deferred_set ds, struct list_head work)
				446	{
				447	int r = 1;
				448	unsigned long flags;
				449	unsigned next_entry;
				450
				451	spin_lock_irqsave(&ds->lock, flags);
				452	if ((ds->sweeper == ds->current_entry) &&
				453	!ds->entries[ds->current_entry].count)
				454	r = 0;
				455	else {
				456	list_add(work, &ds->entries[ds->current_entry].work_items);
				457	next_entry = ds_next(ds->current_entry);
				458	if (!ds->entries[next_entry].count)
				459	ds->current_entry = next_entry;
				460	}
				461	spin_unlock_irqrestore(&ds->lock, flags);
				462
				463	return r;
				464	}
				465
				466	/*----------------------------------------------------------------*/
				467
				468	/*
				469	* Key building.
				470	*/
				471	static void build_data_key(struct dm_thin_device *td,
				472	dm_block_t b, struct cell_key *key)
				473	{
				474	key->virtual = 0;
				475	key->dev = dm_thin_dev_id(td);
				476	key->block = b;
				477	}
				478
				479	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
				480	struct cell_key *key)
				481	{
				482	key->virtual = 1;
				483	key->dev = dm_thin_dev_id(td);
				484	key->block = b;
				485	}
				486
				487	/*----------------------------------------------------------------*/
				488
				489	/*
				490	* A pool device ties together a metadata device and a data device. It
				491	* also provides the interface for creating and destroying internal
				492	* devices.
				493	*/
				494	struct new_mapping;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	495
				496	struct pool_features {
				497	unsigned zero_new_blocks:1;
				498	unsigned discard_enabled:1;
				499	unsigned discard_passdown:1;
				500	};
				501
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	502	struct pool {
				503	struct list_head list;
				504	struct dm_target ti; / Only set if a pool target is bound */
				505
				506	struct mapped_device *pool_md;
				507	struct block_device *md_dev;
				508	struct dm_pool_metadata *pmd;
				509
				510	uint32_t sectors_per_block;
				511	unsigned block_shift;
				512	dm_block_t offset_mask;
				513	dm_block_t low_water_blocks;
				514
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	515	struct pool_features pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	516	unsigned low_water_triggered:1; /* A dm event has been sent */
				517	unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
				518
				519	struct bio_prison *prison;
				520	struct dm_kcopyd_client *copier;
				521
				522	struct workqueue_struct *wq;
				523	struct work_struct worker;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	524	struct delayed_work waker;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	525
				526	unsigned ref_count;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	527	unsigned long last_commit_jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	528
				529	spinlock_t lock;
				530	struct bio_list deferred_bios;
				531	struct bio_list deferred_flush_bios;
				532	struct list_head prepared_mappings;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	533	struct list_head prepared_discards;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	534
				535	struct bio_list retry_on_resume_list;
				536
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	537	struct deferred_set shared_read_ds;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	538	struct deferred_set all_io_ds;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	539
				540	struct new_mapping *next_mapping;
				541	mempool_t *mapping_pool;
				542	mempool_t *endio_hook_pool;
				543	};
				544
				545	/*
				546	* Target context for a pool.
				547	*/
				548	struct pool_c {
				549	struct dm_target *ti;
				550	struct pool *pool;
				551	struct dm_dev *data_dev;
				552	struct dm_dev *metadata_dev;
				553	struct dm_target_callbacks callbacks;
				554
				555	dm_block_t low_water_blocks;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	556	struct pool_features pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	557	};
				558
				559	/*
				560	* Target context for a thin.
				561	*/
				562	struct thin_c {
				563	struct dm_dev *pool_dev;
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	564	struct dm_dev *origin_dev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	565	dm_thin_id dev_id;
				566
				567	struct pool *pool;
				568	struct dm_thin_device *td;
				569	};
				570
				571	/*----------------------------------------------------------------*/
				572
				573	/*
				574	* A global list of pools that uses a struct mapped_device as a key.
				575	*/
				576	static struct dm_thin_pool_table {
				577	struct mutex mutex;
				578	struct list_head pools;
				579	} dm_thin_pool_table;
				580
				581	static void pool_table_init(void)
				582	{
				583	mutex_init(&dm_thin_pool_table.mutex);
				584	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
				585	}
				586
				587	static void __pool_table_insert(struct pool *pool)
				588	{
				589	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				590	list_add(&pool->list, &dm_thin_pool_table.pools);
				591	}
				592
				593	static void __pool_table_remove(struct pool *pool)
				594	{
				595	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				596	list_del(&pool->list);
				597	}
				598
				599	static struct pool __pool_table_lookup(struct mapped_device md)
				600	{
				601	struct pool pool = NULL, tmp;
				602
				603	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				604
				605	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				606	if (tmp->pool_md == md) {
				607	pool = tmp;
				608	break;
				609	}
				610	}
				611
				612	return pool;
				613	}
				614
				615	static struct pool __pool_table_lookup_metadata_dev(struct block_device md_dev)
				616	{
				617	struct pool pool = NULL, tmp;
				618
				619	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				620
				621	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
				622	if (tmp->md_dev == md_dev) {
				623	pool = tmp;
				624	break;
				625	}
				626	}
				627
				628	return pool;
				629	}
				630
				631	/*----------------------------------------------------------------*/
				632
/*
 * Per-bio context, retrieved through dm_get_mapinfo(bio)->ptr.
 */
struct endio_hook {
	struct thin_c *tc;			/* thin device the bio belongs to */
	struct deferred_entry *shared_read_entry;
	struct deferred_entry *all_io_entry;
	struct new_mapping *overwrite_mapping;	/* read by overwrite_endio() */
};
				639
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	640	static void __requeue_bio_list(struct thin_c tc, struct bio_list master)
				641	{
				642	struct bio *bio;
				643	struct bio_list bios;
				644
				645	bio_list_init(&bios);
				646	bio_list_merge(&bios, master);
				647	bio_list_init(master);
				648
				649	while ((bio = bio_list_pop(&bios))) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	650	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				651	if (h->tc == tc)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	652	bio_endio(bio, DM_ENDIO_REQUEUE);
				653	else
				654	bio_list_add(master, bio);
				655	}
				656	}
				657
				658	static void requeue_io(struct thin_c *tc)
				659	{
				660	struct pool *pool = tc->pool;
				661	unsigned long flags;
				662
				663	spin_lock_irqsave(&pool->lock, flags);
				664	__requeue_bio_list(tc, &pool->deferred_bios);
				665	__requeue_bio_list(tc, &pool->retry_on_resume_list);
				666	spin_unlock_irqrestore(&pool->lock, flags);
				667	}
				668
				669	/*
				670	* This section of code contains the logic for processing a thin device's IO.
				671	* Much of the code depends on pool object resources (lists, workqueues, etc)
				672	* but most is exclusively called from the thin target rather than the thin-pool
				673	* target.
				674	*/
				675
				676	static dm_block_t get_bio_block(struct thin_c tc, struct bio bio)
				677	{
				678	return bio->bi_sector >> tc->pool->block_shift;
				679	}
				680
				681	static void remap(struct thin_c tc, struct bio bio, dm_block_t block)
				682	{
				683	struct pool *pool = tc->pool;
				684
				685	bio->bi_bdev = tc->pool_dev->bdev;
				686	bio->bi_sector = (block << pool->block_shift) +
				687	(bio->bi_sector & pool->offset_mask);
				688	}
				689
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	690	static void remap_to_origin(struct thin_c tc, struct bio bio)
				691	{
				692	bio->bi_bdev = tc->origin_dev->bdev;
				693	}
				694
				695	static void issue(struct thin_c tc, struct bio bio)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	696	{
				697	struct pool *pool = tc->pool;
				698	unsigned long flags;
				699
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	700	/*
				701	* Batch together any FUA/FLUSH bios we find and then issue
				702	* a single commit for them in process_deferred_bios().
				703	*/
				704	if (bio->bi_rw & (REQ_FLUSH \| REQ_FUA)) {
				705	spin_lock_irqsave(&pool->lock, flags);
				706	bio_list_add(&pool->deferred_flush_bios, bio);
				707	spin_unlock_irqrestore(&pool->lock, flags);
				708	} else
				709	generic_make_request(bio);
				710	}
				711
/*
 * Point @bio at the origin device and issue it.
 */
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}
				717
				718	static void remap_and_issue(struct thin_c tc, struct bio bio,
				719	dm_block_t block)
				720	{
				721	remap(tc, bio, block);
				722	issue(tc, bio);
				723	}
				724
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	725	/*
				726	* wake_worker() is used when new work is queued and when pool_resume is
				727	* ready to continue deferred IO processing.
				728	*/
				729	static void wake_worker(struct pool *pool)
				730	{
				731	queue_work(pool->wq, &pool->worker);
				732	}
				733
				734	/*----------------------------------------------------------------*/
				735
				736	/*
				737	* Bio endio functions.
				738	*/
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	739	struct new_mapping {
				740	struct list_head list;
				741
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	742	unsigned quiesced:1;
				743	unsigned prepared:1;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	744	unsigned pass_discard:1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	745
				746	struct thin_c *tc;
				747	dm_block_t virt_block;
				748	dm_block_t data_block;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	749	struct cell cell, cell2;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	750	int err;
				751
				752	/*
				753	* If the bio covers the whole area of a block then we can avoid
				754	* zeroing or copying. Instead this bio is hooked. The bio will
				755	* still be in the cell, so care has to be taken to avoid issuing
				756	* the bio twice.
				757	*/
				758	struct bio *bio;
				759	bio_end_io_t *saved_bi_end_io;
				760	};
				761
				762	static void __maybe_add_mapping(struct new_mapping *m)
				763	{
				764	struct pool *pool = m->tc->pool;
				765
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	766	if (m->quiesced && m->prepared) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	767	list_add(&m->list, &pool->prepared_mappings);
				768	wake_worker(pool);
				769	}
				770	}
				771
				772	static void copy_complete(int read_err, unsigned long write_err, void *context)
				773	{
				774	unsigned long flags;
				775	struct new_mapping *m = context;
				776	struct pool *pool = m->tc->pool;
				777
				778	m->err = read_err \|\| write_err ? -EIO : 0;
				779
				780	spin_lock_irqsave(&pool->lock, flags);
				781	m->prepared = 1;
				782	__maybe_add_mapping(m);
				783	spin_unlock_irqrestore(&pool->lock, flags);
				784	}
				785
				786	static void overwrite_endio(struct bio *bio, int err)
				787	{
				788	unsigned long flags;
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	789	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				790	struct new_mapping *m = h->overwrite_mapping;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	791	struct pool *pool = m->tc->pool;
				792
				793	m->err = err;
				794
				795	spin_lock_irqsave(&pool->lock, flags);
				796	m->prepared = 1;
				797	__maybe_add_mapping(m);
				798	spin_unlock_irqrestore(&pool->lock, flags);
				799	}
				800
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	801	/*----------------------------------------------------------------*/
				802
				803	/*
				804	* Workqueue.
				805	*/
				806
				807	/*
				808	* Prepared mapping jobs.
				809	*/
				810
				811	/*
				812	* This sends the bios in the cell back to the deferred_bios list.
				813	*/
				814	static void cell_defer(struct thin_c tc, struct cell cell,
				815	dm_block_t data_block)
				816	{
				817	struct pool *pool = tc->pool;
				818	unsigned long flags;
				819
				820	spin_lock_irqsave(&pool->lock, flags);
				821	cell_release(cell, &pool->deferred_bios);
				822	spin_unlock_irqrestore(&tc->pool->lock, flags);
				823
				824	wake_worker(pool);
				825	}
				826
				827	/*
				828	* Same as cell_defer above, except it omits one particular detainee,
				829	* a write bio that covers the block and has already been processed.
				830	*/
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	831	static void cell_defer_except(struct thin_c tc, struct cell cell)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	832	{
				833	struct bio_list bios;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	834	struct pool *pool = tc->pool;
				835	unsigned long flags;
				836
				837	bio_list_init(&bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	838
				839	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	840	cell_release_no_holder(cell, &pool->deferred_bios);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	841	spin_unlock_irqrestore(&pool->lock, flags);
				842
				843	wake_worker(pool);
				844	}
				845
				846	static void process_prepared_mapping(struct new_mapping *m)
				847	{
				848	struct thin_c *tc = m->tc;
				849	struct bio *bio;
				850	int r;
				851
				852	bio = m->bio;
				853	if (bio)
				854	bio->bi_end_io = m->saved_bi_end_io;
				855
				856	if (m->err) {
				857	cell_error(m->cell);
				858	return;
				859	}
				860
				861	/*
				862	* Commit the prepared block into the mapping btree.
				863	* Any I/O for this block arriving after this point will get
				864	* remapped to it directly.
				865	*/
				866	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
				867	if (r) {
				868	DMERR("dm_thin_insert_block() failed");
				869	cell_error(m->cell);
				870	return;
				871	}
				872
				873	/*
				874	* Release any bios held while the block was being provisioned.
				875	* If we are processing a write bio that completely covers the block,
				876	* we already processed it so can ignore it now when processing
				877	* the bios in the cell.
				878	*/
				879	if (bio) {
Joe Thornber	6f94a4c	2012-03-28 18:41:23 +0100	[diff] [blame]	880	cell_defer_except(tc, m->cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	881	bio_endio(bio, 0);
				882	} else
				883	cell_defer(tc, m->cell, m->data_block);
				884
				885	list_del(&m->list);
				886	mempool_free(m, tc->pool->mapping_pool);
				887	}
				888
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	889	static void process_prepared_discard(struct new_mapping *m)
				890	{
				891	int r;
				892	struct thin_c *tc = m->tc;
				893
				894	r = dm_thin_remove_block(tc->td, m->virt_block);
				895	if (r)
				896	DMERR("dm_thin_remove_block() failed");
				897
				898	/*
				899	* Pass the discard down to the underlying device?
				900	*/
				901	if (m->pass_discard)
				902	remap_and_issue(tc, m->bio, m->data_block);
				903	else
				904	bio_endio(m->bio, 0);
				905
				906	cell_defer_except(tc, m->cell);
				907	cell_defer_except(tc, m->cell2);
				908	mempool_free(m, tc->pool->mapping_pool);
				909	}
				910
				911	static void process_prepared(struct pool pool, struct list_head head,
				912	void (fn)(struct new_mapping ))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	913	{
				914	unsigned long flags;
				915	struct list_head maps;
				916	struct new_mapping m, tmp;
				917
				918	INIT_LIST_HEAD(&maps);
				919	spin_lock_irqsave(&pool->lock, flags);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	920	list_splice_init(head, &maps);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	921	spin_unlock_irqrestore(&pool->lock, flags);
				922
				923	list_for_each_entry_safe(m, tmp, &maps, list)
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	924	fn(m);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	925	}
				926
				927	/*
				928	* Deferred bio jobs.
				929	*/
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	930	static int io_overlaps_block(struct pool pool, struct bio bio)
				931	{
				932	return !(bio->bi_sector & pool->offset_mask) &&
				933	(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
				934
				935	}
				936
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	937	static int io_overwrites_block(struct pool pool, struct bio bio)
				938	{
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	939	return (bio_data_dir(bio) == WRITE) &&
				940	io_overlaps_block(pool, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	941	}
				942
				943	static void save_and_set_endio(struct bio bio, bio_end_io_t *save,
				944	bio_end_io_t *fn)
				945	{
				946	*save = bio->bi_end_io;
				947	bio->bi_end_io = fn;
				948	}
				949
				950	static int ensure_next_mapping(struct pool *pool)
				951	{
				952	if (pool->next_mapping)
				953	return 0;
				954
				955	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
				956
				957	return pool->next_mapping ? 0 : -ENOMEM;
				958	}
				959
				960	static struct new_mapping get_next_mapping(struct pool pool)
				961	{
				962	struct new_mapping *r = pool->next_mapping;
				963
				964	BUG_ON(!pool->next_mapping);
				965
				966	pool->next_mapping = NULL;
				967
				968	return r;
				969	}
				970
				971	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	972	struct dm_dev *origin, dm_block_t data_origin,
				973	dm_block_t data_dest,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	974	struct cell cell, struct bio bio)
				975	{
				976	int r;
				977	struct pool *pool = tc->pool;
				978	struct new_mapping *m = get_next_mapping(pool);
				979
				980	INIT_LIST_HEAD(&m->list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	981	m->quiesced = 0;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	982	m->prepared = 0;
				983	m->tc = tc;
				984	m->virt_block = virt_block;
				985	m->data_block = data_dest;
				986	m->cell = cell;
				987	m->err = 0;
				988	m->bio = NULL;
				989
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	990	if (!ds_add_work(&pool->shared_read_ds, &m->list))
				991	m->quiesced = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	992
				993	/*
				994	* IO to pool_dev remaps to the pool target's data_dev.
				995	*
				996	* If the whole block of data is being overwritten, we can issue the
				997	* bio immediately. Otherwise we use kcopyd to clone the data first.
				998	*/
				999	if (io_overwrites_block(pool, bio)) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1000	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1001	h->overwrite_mapping = m;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1002	m->bio = bio;
				1003	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1004	remap_and_issue(tc, bio, data_dest);
				1005	} else {
				1006	struct dm_io_region from, to;
				1007
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1008	from.bdev = origin->bdev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1009	from.sector = data_origin * pool->sectors_per_block;
				1010	from.count = pool->sectors_per_block;
				1011
				1012	to.bdev = tc->pool_dev->bdev;
				1013	to.sector = data_dest * pool->sectors_per_block;
				1014	to.count = pool->sectors_per_block;
				1015
				1016	r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				1017	0, copy_complete, m);
				1018	if (r < 0) {
				1019	mempool_free(m, pool->mapping_pool);
				1020	DMERR("dm_kcopyd_copy() failed");
				1021	cell_error(cell);
				1022	}
				1023	}
				1024	}
				1025
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1026	static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				1027	dm_block_t data_origin, dm_block_t data_dest,
				1028	struct cell cell, struct bio bio)
				1029	{
				1030	schedule_copy(tc, virt_block, tc->pool_dev,
				1031	data_origin, data_dest, cell, bio);
				1032	}
				1033
				1034	static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				1035	dm_block_t data_dest,
				1036	struct cell cell, struct bio bio)
				1037	{
				1038	schedule_copy(tc, virt_block, tc->origin_dev,
				1039	virt_block, data_dest, cell, bio);
				1040	}
				1041
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1042	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
				1043	dm_block_t data_block, struct cell *cell,
				1044	struct bio *bio)
				1045	{
				1046	struct pool *pool = tc->pool;
				1047	struct new_mapping *m = get_next_mapping(pool);
				1048
				1049	INIT_LIST_HEAD(&m->list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1050	m->quiesced = 1;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1051	m->prepared = 0;
				1052	m->tc = tc;
				1053	m->virt_block = virt_block;
				1054	m->data_block = data_block;
				1055	m->cell = cell;
				1056	m->err = 0;
				1057	m->bio = NULL;
				1058
				1059	/*
				1060	* If the whole block of data is being overwritten or we are not
				1061	* zeroing pre-existing data, we can issue the bio immediately.
				1062	* Otherwise we use kcopyd to zero the data first.
				1063	*/
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1064	if (!pool->pf.zero_new_blocks)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1065	process_prepared_mapping(m);
				1066
				1067	else if (io_overwrites_block(pool, bio)) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1068	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1069	h->overwrite_mapping = m;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1070	m->bio = bio;
				1071	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1072	remap_and_issue(tc, bio, data_block);
				1073
				1074	} else {
				1075	int r;
				1076	struct dm_io_region to;
				1077
				1078	to.bdev = tc->pool_dev->bdev;
				1079	to.sector = data_block * pool->sectors_per_block;
				1080	to.count = pool->sectors_per_block;
				1081
				1082	r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
				1083	if (r < 0) {
				1084	mempool_free(m, pool->mapping_pool);
				1085	DMERR("dm_kcopyd_zero() failed");
				1086	cell_error(cell);
				1087	}
				1088	}
				1089	}
				1090
				1091	static int alloc_data_block(struct thin_c tc, dm_block_t result)
				1092	{
				1093	int r;
				1094	dm_block_t free_blocks;
				1095	unsigned long flags;
				1096	struct pool *pool = tc->pool;
				1097
				1098	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1099	if (r)
				1100	return r;
				1101
				1102	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
				1103	DMWARN("%s: reached low water mark, sending event.",
				1104	dm_device_name(pool->pool_md));
				1105	spin_lock_irqsave(&pool->lock, flags);
				1106	pool->low_water_triggered = 1;
				1107	spin_unlock_irqrestore(&pool->lock, flags);
				1108	dm_table_event(pool->ti->table);
				1109	}
				1110
				1111	if (!free_blocks) {
				1112	if (pool->no_free_space)
				1113	return -ENOSPC;
				1114	else {
				1115	/*
				1116	* Try to commit to see if that will free up some
				1117	* more space.
				1118	*/
				1119	r = dm_pool_commit_metadata(pool->pmd);
				1120	if (r) {
				1121	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1122	__func__, r);
				1123	return r;
				1124	}
				1125
				1126	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
				1127	if (r)
				1128	return r;
				1129
				1130	/*
				1131	* If we still have no space we set a flag to avoid
				1132	* doing all this checking and return -ENOSPC.
				1133	*/
				1134	if (!free_blocks) {
				1135	DMWARN("%s: no free space available.",
				1136	dm_device_name(pool->pool_md));
				1137	spin_lock_irqsave(&pool->lock, flags);
				1138	pool->no_free_space = 1;
				1139	spin_unlock_irqrestore(&pool->lock, flags);
				1140	return -ENOSPC;
				1141	}
				1142	}
				1143	}
				1144
				1145	r = dm_pool_alloc_data_block(pool->pmd, result);
				1146	if (r)
				1147	return r;
				1148
				1149	return 0;
				1150	}
				1151
				1152	/*
				1153	* If we have run out of space, queue bios until the device is
				1154	* resumed, presumably after having been reloaded with more space.
				1155	*/
				1156	static void retry_on_resume(struct bio *bio)
				1157	{
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1158	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1159	struct thin_c *tc = h->tc;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1160	struct pool *pool = tc->pool;
				1161	unsigned long flags;
				1162
				1163	spin_lock_irqsave(&pool->lock, flags);
				1164	bio_list_add(&pool->retry_on_resume_list, bio);
				1165	spin_unlock_irqrestore(&pool->lock, flags);
				1166	}
				1167
				1168	static void no_space(struct cell *cell)
				1169	{
				1170	struct bio *bio;
				1171	struct bio_list bios;
				1172
				1173	bio_list_init(&bios);
				1174	cell_release(cell, &bios);
				1175
				1176	while ((bio = bio_list_pop(&bios)))
				1177	retry_on_resume(bio);
				1178	}
				1179
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1180	static void process_discard(struct thin_c tc, struct bio bio)
				1181	{
				1182	int r;
				1183	struct pool *pool = tc->pool;
				1184	struct cell cell, cell2;
				1185	struct cell_key key, key2;
				1186	dm_block_t block = get_bio_block(tc, bio);
				1187	struct dm_thin_lookup_result lookup_result;
				1188	struct new_mapping *m;
				1189
				1190	build_virtual_key(tc->td, block, &key);
				1191	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1192	return;
				1193
				1194	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1195	switch (r) {
				1196	case 0:
				1197	/*
				1198	* Check nobody is fiddling with this pool block. This can
				1199	* happen if someone's in the process of breaking sharing
				1200	* on this block.
				1201	*/
				1202	build_data_key(tc->td, lookup_result.block, &key2);
				1203	if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
				1204	cell_release_singleton(cell, bio);
				1205	break;
				1206	}
				1207
				1208	if (io_overlaps_block(pool, bio)) {
				1209	/*
				1210	* IO may still be going to the destination block. We must
				1211	* quiesce before we can do the removal.
				1212	*/
				1213	m = get_next_mapping(pool);
				1214	m->tc = tc;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1215	m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1216	m->virt_block = block;
				1217	m->data_block = lookup_result.block;
				1218	m->cell = cell;
				1219	m->cell2 = cell2;
				1220	m->err = 0;
				1221	m->bio = bio;
				1222
				1223	if (!ds_add_work(&pool->all_io_ds, &m->list)) {
				1224	list_add(&m->list, &pool->prepared_discards);
				1225	wake_worker(pool);
				1226	}
				1227	} else {
				1228	/*
				1229	* This path is hit if people are ignoring
				1230	* limits->discard_granularity. It ignores any
				1231	* part of the discard that is in a subsequent
				1232	* block.
				1233	*/
				1234	sector_t offset = bio->bi_sector - (block << pool->block_shift);
				1235	unsigned remaining = (pool->sectors_per_block - offset) << 9;
				1236	bio->bi_size = min(bio->bi_size, remaining);
				1237
				1238	cell_release_singleton(cell, bio);
				1239	cell_release_singleton(cell2, bio);
				1240	remap_and_issue(tc, bio, lookup_result.block);
				1241	}
				1242	break;
				1243
				1244	case -ENODATA:
				1245	/*
				1246	* It isn't provisioned, just forget it.
				1247	*/
				1248	cell_release_singleton(cell, bio);
				1249	bio_endio(bio, 0);
				1250	break;
				1251
				1252	default:
				1253	DMERR("discard: find block unexpectedly returned %d", r);
				1254	cell_release_singleton(cell, bio);
				1255	bio_io_error(bio);
				1256	break;
				1257	}
				1258	}
				1259
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1260	static void break_sharing(struct thin_c tc, struct bio bio, dm_block_t block,
				1261	struct cell_key *key,
				1262	struct dm_thin_lookup_result *lookup_result,
				1263	struct cell *cell)
				1264	{
				1265	int r;
				1266	dm_block_t data_block;
				1267
				1268	r = alloc_data_block(tc, &data_block);
				1269	switch (r) {
				1270	case 0:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1271	schedule_internal_copy(tc, block, lookup_result->block,
				1272	data_block, cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1273	break;
				1274
				1275	case -ENOSPC:
				1276	no_space(cell);
				1277	break;
				1278
				1279	default:
				1280	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1281	cell_error(cell);
				1282	break;
				1283	}
				1284	}
				1285
				1286	static void process_shared_bio(struct thin_c tc, struct bio bio,
				1287	dm_block_t block,
				1288	struct dm_thin_lookup_result *lookup_result)
				1289	{
				1290	struct cell *cell;
				1291	struct pool *pool = tc->pool;
				1292	struct cell_key key;
				1293
				1294	/*
				1295	* If cell is already occupied, then sharing is already in the process
				1296	* of being broken so we have nothing further to do here.
				1297	*/
				1298	build_data_key(tc->td, lookup_result->block, &key);
				1299	if (bio_detain(pool->prison, &key, bio, &cell))
				1300	return;
				1301
				1302	if (bio_data_dir(bio) == WRITE)
				1303	break_sharing(tc, bio, block, &key, lookup_result, cell);
				1304	else {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1305	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1306
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1307	h->shared_read_entry = ds_inc(&pool->shared_read_ds);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1308
				1309	cell_release_singleton(cell, bio);
				1310	remap_and_issue(tc, bio, lookup_result->block);
				1311	}
				1312	}
				1313
				1314	static void provision_block(struct thin_c tc, struct bio bio, dm_block_t block,
				1315	struct cell *cell)
				1316	{
				1317	int r;
				1318	dm_block_t data_block;
				1319
				1320	/*
				1321	* Remap empty bios (flushes) immediately, without provisioning.
				1322	*/
				1323	if (!bio->bi_size) {
				1324	cell_release_singleton(cell, bio);
				1325	remap_and_issue(tc, bio, 0);
				1326	return;
				1327	}
				1328
				1329	/*
				1330	* Fill read bios with zeroes and complete them immediately.
				1331	*/
				1332	if (bio_data_dir(bio) == READ) {
				1333	zero_fill_bio(bio);
				1334	cell_release_singleton(cell, bio);
				1335	bio_endio(bio, 0);
				1336	return;
				1337	}
				1338
				1339	r = alloc_data_block(tc, &data_block);
				1340	switch (r) {
				1341	case 0:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1342	if (tc->origin_dev)
				1343	schedule_external_copy(tc, block, data_block, cell, bio);
				1344	else
				1345	schedule_zero(tc, block, data_block, cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1346	break;
				1347
				1348	case -ENOSPC:
				1349	no_space(cell);
				1350	break;
				1351
				1352	default:
				1353	DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
				1354	cell_error(cell);
				1355	break;
				1356	}
				1357	}
				1358
				1359	static void process_bio(struct thin_c tc, struct bio bio)
				1360	{
				1361	int r;
				1362	dm_block_t block = get_bio_block(tc, bio);
				1363	struct cell *cell;
				1364	struct cell_key key;
				1365	struct dm_thin_lookup_result lookup_result;
				1366
				1367	/*
				1368	* If cell is already occupied, then the block is already
				1369	* being provisioned so we have nothing further to do here.
				1370	*/
				1371	build_virtual_key(tc->td, block, &key);
				1372	if (bio_detain(tc->pool->prison, &key, bio, &cell))
				1373	return;
				1374
				1375	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
				1376	switch (r) {
				1377	case 0:
				1378	/*
				1379	* We can release this cell now. This thread is the only
				1380	* one that puts bios into a cell, and we know there were
				1381	* no preceding bios.
				1382	*/
				1383	/*
				1384	* TODO: this will probably have to change when discard goes
				1385	* back in.
				1386	*/
				1387	cell_release_singleton(cell, bio);
				1388
				1389	if (lookup_result.shared)
				1390	process_shared_bio(tc, bio, block, &lookup_result);
				1391	else
				1392	remap_and_issue(tc, bio, lookup_result.block);
				1393	break;
				1394
				1395	case -ENODATA:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	1396	if (bio_data_dir(bio) == READ && tc->origin_dev) {
				1397	cell_release_singleton(cell, bio);
				1398	remap_to_origin_and_issue(tc, bio);
				1399	} else
				1400	provision_block(tc, bio, block, cell);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1401	break;
				1402
				1403	default:
				1404	DMERR("dm_thin_find_block() failed, error = %d", r);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1405	cell_release_singleton(cell, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1406	bio_io_error(bio);
				1407	break;
				1408	}
				1409	}
				1410
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1411	static int need_commit_due_to_time(struct pool *pool)
				1412	{
				1413	return jiffies < pool->last_commit_jiffies \|\|
				1414	jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
				1415	}
				1416
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1417	static void process_deferred_bios(struct pool *pool)
				1418	{
				1419	unsigned long flags;
				1420	struct bio *bio;
				1421	struct bio_list bios;
				1422	int r;
				1423
				1424	bio_list_init(&bios);
				1425
				1426	spin_lock_irqsave(&pool->lock, flags);
				1427	bio_list_merge(&bios, &pool->deferred_bios);
				1428	bio_list_init(&pool->deferred_bios);
				1429	spin_unlock_irqrestore(&pool->lock, flags);
				1430
				1431	while ((bio = bio_list_pop(&bios))) {
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1432	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
				1433	struct thin_c *tc = h->tc;
				1434
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1435	/*
				1436	* If we've got no free new_mapping structs, and processing
				1437	* this bio might require one, we pause until there are some
				1438	* prepared mappings to process.
				1439	*/
				1440	if (ensure_next_mapping(pool)) {
				1441	spin_lock_irqsave(&pool->lock, flags);
				1442	bio_list_merge(&pool->deferred_bios, &bios);
				1443	spin_unlock_irqrestore(&pool->lock, flags);
				1444
				1445	break;
				1446	}
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1447
				1448	if (bio->bi_rw & REQ_DISCARD)
				1449	process_discard(tc, bio);
				1450	else
				1451	process_bio(tc, bio);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1452	}
				1453
				1454	/*
				1455	* If there are any deferred flush bios, we must commit
				1456	* the metadata before issuing them.
				1457	*/
				1458	bio_list_init(&bios);
				1459	spin_lock_irqsave(&pool->lock, flags);
				1460	bio_list_merge(&bios, &pool->deferred_flush_bios);
				1461	bio_list_init(&pool->deferred_flush_bios);
				1462	spin_unlock_irqrestore(&pool->lock, flags);
				1463
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1464	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1465	return;
				1466
				1467	r = dm_pool_commit_metadata(pool->pmd);
				1468	if (r) {
				1469	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				1470	__func__, r);
				1471	while ((bio = bio_list_pop(&bios)))
				1472	bio_io_error(bio);
				1473	return;
				1474	}
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1475	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1476
				1477	while ((bio = bio_list_pop(&bios)))
				1478	generic_make_request(bio);
				1479	}
				1480
				1481	static void do_worker(struct work_struct *ws)
				1482	{
				1483	struct pool *pool = container_of(ws, struct pool, worker);
				1484
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1485	process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
				1486	process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1487	process_deferred_bios(pool);
				1488	}
				1489
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1490	/*
				1491	* We want to commit periodically so that not too much
				1492	* unwritten data builds up.
				1493	*/
				1494	static void do_waker(struct work_struct *ws)
				1495	{
				1496	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
				1497	wake_worker(pool);
				1498	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
				1499	}
				1500
/*----------------------------------------------------------------*/
				1502
				1503	/*
				1504	* Mapping functions.
				1505	*/
				1506
				1507	/*
				1508	* Called only while mapping a thin bio to hand it over to the workqueue.
				1509	*/
				1510	static void thin_defer_bio(struct thin_c tc, struct bio bio)
				1511	{
				1512	unsigned long flags;
				1513	struct pool *pool = tc->pool;
				1514
				1515	spin_lock_irqsave(&pool->lock, flags);
				1516	bio_list_add(&pool->deferred_bios, bio);
				1517	spin_unlock_irqrestore(&pool->lock, flags);
				1518
				1519	wake_worker(pool);
				1520	}
				1521
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1522	static struct endio_hook thin_hook_bio(struct thin_c tc, struct bio *bio)
				1523	{
				1524	struct pool *pool = tc->pool;
				1525	struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
				1526
				1527	h->tc = tc;
				1528	h->shared_read_entry = NULL;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1529	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1530	h->overwrite_mapping = NULL;
				1531
				1532	return h;
				1533	}
				1534
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1535	/*
				1536	* Non-blocking function called from the thin target's map function.
				1537	*/
				1538	static int thin_bio_map(struct dm_target ti, struct bio bio,
				1539	union map_info *map_context)
				1540	{
				1541	int r;
				1542	struct thin_c *tc = ti->private;
				1543	dm_block_t block = get_bio_block(tc, bio);
				1544	struct dm_thin_device *td = tc->td;
				1545	struct dm_thin_lookup_result result;
				1546
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1547	map_context->ptr = thin_hook_bio(tc, bio);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1548	if (bio->bi_rw & (REQ_DISCARD \| REQ_FLUSH \| REQ_FUA)) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1549	thin_defer_bio(tc, bio);
				1550	return DM_MAPIO_SUBMITTED;
				1551	}
				1552
				1553	r = dm_thin_find_block(td, block, 0, &result);
				1554
				1555	/*
				1556	* Note that we defer readahead too.
				1557	*/
				1558	switch (r) {
				1559	case 0:
				1560	if (unlikely(result.shared)) {
				1561	/*
				1562	* We have a race condition here between the
				1563	* result.shared value returned by the lookup and
				1564	* snapshot creation, which may cause new
				1565	* sharing.
				1566	*
				1567	* To avoid this always quiesce the origin before
				1568	* taking the snap. You want to do this anyway to
				1569	* ensure a consistent application view
				1570	* (i.e. lockfs).
				1571	*
				1572	* More distant ancestors are irrelevant. The
				1573	* shared flag will be set in their case.
				1574	*/
				1575	thin_defer_bio(tc, bio);
				1576	r = DM_MAPIO_SUBMITTED;
				1577	} else {
				1578	remap(tc, bio, result.block);
				1579	r = DM_MAPIO_REMAPPED;
				1580	}
				1581	break;
				1582
				1583	case -ENODATA:
				1584	/*
				1585	* In future, the failed dm_thin_find_block above could
				1586	* provide the hint to load the metadata into cache.
				1587	*/
				1588	case -EWOULDBLOCK:
				1589	thin_defer_bio(tc, bio);
				1590	r = DM_MAPIO_SUBMITTED;
				1591	break;
				1592	}
				1593
				1594	return r;
				1595	}
				1596
				1597	static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
				1598	{
				1599	int r;
				1600	unsigned long flags;
				1601	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
				1602
				1603	spin_lock_irqsave(&pt->pool->lock, flags);
				1604	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
				1605	spin_unlock_irqrestore(&pt->pool->lock, flags);
				1606
				1607	if (!r) {
				1608	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				1609	r = bdi_congested(&q->backing_dev_info, bdi_bits);
				1610	}
				1611
				1612	return r;
				1613	}
				1614
				1615	static void __requeue_bios(struct pool *pool)
				1616	{
				1617	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
				1618	bio_list_init(&pool->retry_on_resume_list);
				1619	}
				1620
/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
				1624	static int bind_control_target(struct pool pool, struct dm_target ti)
				1625	{
				1626	struct pool_c *pt = ti->private;
				1627
				1628	pool->ti = ti;
				1629	pool->low_water_blocks = pt->low_water_blocks;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1630	pool->pf = pt->pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1631
				1632	return 0;
				1633	}
				1634
				1635	static void unbind_control_target(struct pool pool, struct dm_target ti)
				1636	{
				1637	if (pool->ti == ti)
				1638	pool->ti = NULL;
				1639	}
				1640
/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1644	/* Initialize pool features. */
				1645	static void pool_features_init(struct pool_features *pf)
				1646	{
				1647	pf->zero_new_blocks = 1;
				1648	pf->discard_enabled = 1;
				1649	pf->discard_passdown = 1;
				1650	}
				1651
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1652	static void __pool_destroy(struct pool *pool)
				1653	{
				1654	__pool_table_remove(pool);
				1655
				1656	if (dm_pool_metadata_close(pool->pmd) < 0)
				1657	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1658
				1659	prison_destroy(pool->prison);
				1660	dm_kcopyd_client_destroy(pool->copier);
				1661
				1662	if (pool->wq)
				1663	destroy_workqueue(pool->wq);
				1664
				1665	if (pool->next_mapping)
				1666	mempool_free(pool->next_mapping, pool->mapping_pool);
				1667	mempool_destroy(pool->mapping_pool);
				1668	mempool_destroy(pool->endio_hook_pool);
				1669	kfree(pool);
				1670	}
				1671
				1672	static struct pool pool_create(struct mapped_device pool_md,
				1673	struct block_device *metadata_dev,
				1674	unsigned long block_size, char **error)
				1675	{
				1676	int r;
				1677	void *err_p;
				1678	struct pool *pool;
				1679	struct dm_pool_metadata *pmd;
				1680
				1681	pmd = dm_pool_metadata_open(metadata_dev, block_size);
				1682	if (IS_ERR(pmd)) {
				1683	*error = "Error creating metadata object";
				1684	return (struct pool *)pmd;
				1685	}
				1686
				1687	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
				1688	if (!pool) {
				1689	*error = "Error allocating memory for pool";
				1690	err_p = ERR_PTR(-ENOMEM);
				1691	goto bad_pool;
				1692	}
				1693
				1694	pool->pmd = pmd;
				1695	pool->sectors_per_block = block_size;
				1696	pool->block_shift = ffs(block_size) - 1;
				1697	pool->offset_mask = block_size - 1;
				1698	pool->low_water_blocks = 0;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1699	pool_features_init(&pool->pf);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1700	pool->prison = prison_create(PRISON_CELLS);
				1701	if (!pool->prison) {
				1702	*error = "Error creating pool's bio prison";
				1703	err_p = ERR_PTR(-ENOMEM);
				1704	goto bad_prison;
				1705	}
				1706
				1707	pool->copier = dm_kcopyd_client_create();
				1708	if (IS_ERR(pool->copier)) {
				1709	r = PTR_ERR(pool->copier);
				1710	*error = "Error creating pool's kcopyd client";
				1711	err_p = ERR_PTR(r);
				1712	goto bad_kcopyd_client;
				1713	}
				1714
				1715	/*
				1716	* Create singlethreaded workqueue that will service all devices
				1717	* that use this metadata.
				1718	*/
				1719	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
				1720	if (!pool->wq) {
				1721	*error = "Error creating pool's workqueue";
				1722	err_p = ERR_PTR(-ENOMEM);
				1723	goto bad_wq;
				1724	}
				1725
				1726	INIT_WORK(&pool->worker, do_worker);
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1727	INIT_DELAYED_WORK(&pool->waker, do_waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1728	spin_lock_init(&pool->lock);
				1729	bio_list_init(&pool->deferred_bios);
				1730	bio_list_init(&pool->deferred_flush_bios);
				1731	INIT_LIST_HEAD(&pool->prepared_mappings);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1732	INIT_LIST_HEAD(&pool->prepared_discards);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1733	pool->low_water_triggered = 0;
				1734	pool->no_free_space = 0;
				1735	bio_list_init(&pool->retry_on_resume_list);
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	1736	ds_init(&pool->shared_read_ds);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	1737	ds_init(&pool->all_io_ds);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1738
				1739	pool->next_mapping = NULL;
				1740	pool->mapping_pool =
				1741	mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
				1742	if (!pool->mapping_pool) {
				1743	*error = "Error creating pool's mapping mempool";
				1744	err_p = ERR_PTR(-ENOMEM);
				1745	goto bad_mapping_pool;
				1746	}
				1747
				1748	pool->endio_hook_pool =
				1749	mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
				1750	if (!pool->endio_hook_pool) {
				1751	*error = "Error creating pool's endio_hook mempool";
				1752	err_p = ERR_PTR(-ENOMEM);
				1753	goto bad_endio_hook_pool;
				1754	}
				1755	pool->ref_count = 1;
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	1756	pool->last_commit_jiffies = jiffies;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1757	pool->pool_md = pool_md;
				1758	pool->md_dev = metadata_dev;
				1759	__pool_table_insert(pool);
				1760
				1761	return pool;
				1762
				1763	bad_endio_hook_pool:
				1764	mempool_destroy(pool->mapping_pool);
				1765	bad_mapping_pool:
				1766	destroy_workqueue(pool->wq);
				1767	bad_wq:
				1768	dm_kcopyd_client_destroy(pool->copier);
				1769	bad_kcopyd_client:
				1770	prison_destroy(pool->prison);
				1771	bad_prison:
				1772	kfree(pool);
				1773	bad_pool:
				1774	if (dm_pool_metadata_close(pmd))
				1775	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
				1776
				1777	return err_p;
				1778	}
				1779
				1780	static void __pool_inc(struct pool *pool)
				1781	{
				1782	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1783	pool->ref_count++;
				1784	}
				1785
				1786	static void __pool_dec(struct pool *pool)
				1787	{
				1788	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
				1789	BUG_ON(!pool->ref_count);
				1790	if (!--pool->ref_count)
				1791	__pool_destroy(pool);
				1792	}
				1793
				1794	static struct pool __pool_find(struct mapped_device pool_md,
				1795	struct block_device *metadata_dev,
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1796	unsigned long block_size, char **error,
				1797	int *created)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1798	{
				1799	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
				1800
				1801	if (pool) {
				1802	if (pool->pool_md != pool_md)
				1803	return ERR_PTR(-EBUSY);
				1804	__pool_inc(pool);
				1805
				1806	} else {
				1807	pool = __pool_table_lookup(pool_md);
				1808	if (pool) {
				1809	if (pool->md_dev != metadata_dev)
				1810	return ERR_PTR(-EINVAL);
				1811	__pool_inc(pool);
				1812
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1813	} else {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1814	pool = pool_create(pool_md, metadata_dev, block_size, error);
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1815	*created = 1;
				1816	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1817	}
				1818
				1819	return pool;
				1820	}
				1821
/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
				1825	static void pool_dtr(struct dm_target *ti)
				1826	{
				1827	struct pool_c *pt = ti->private;
				1828
				1829	mutex_lock(&dm_thin_pool_table.mutex);
				1830
				1831	unbind_control_target(pt->pool, ti);
				1832	__pool_dec(pt->pool);
				1833	dm_put_device(ti, pt->metadata_dev);
				1834	dm_put_device(ti, pt->data_dev);
				1835	kfree(pt);
				1836
				1837	mutex_unlock(&dm_thin_pool_table.mutex);
				1838	}
				1839
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1840	static int parse_pool_features(struct dm_arg_set as, struct pool_features pf,
				1841	struct dm_target *ti)
				1842	{
				1843	int r;
				1844	unsigned argc;
				1845	const char *arg_name;
				1846
				1847	static struct dm_arg _args[] = {
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1848	{0, 3, "Invalid number of pool feature arguments"},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1849	};
				1850
				1851	/*
				1852	* No feature arguments supplied.
				1853	*/
				1854	if (!as->argc)
				1855	return 0;
				1856
				1857	r = dm_read_arg_group(_args, as, &argc, &ti->error);
				1858	if (r)
				1859	return -EINVAL;
				1860
				1861	while (argc && !r) {
				1862	arg_name = dm_shift_arg(as);
				1863	argc--;
				1864
				1865	if (!strcasecmp(arg_name, "skip_block_zeroing")) {
				1866	pf->zero_new_blocks = 0;
				1867	continue;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1868	} else if (!strcasecmp(arg_name, "ignore_discard")) {
				1869	pf->discard_enabled = 0;
				1870	continue;
				1871	} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
				1872	pf->discard_passdown = 0;
				1873	continue;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1874	}
				1875
				1876	ti->error = "Unrecognised pool feature requested";
				1877	r = -EINVAL;
				1878	}
				1879
				1880	return r;
				1881	}
				1882
				1883	/*
				1884	* thin-pool <metadata dev> <data dev>
				1885	* <data block size (sectors)>
				1886	* <low water mark (blocks)>
				1887	* [<#feature args> [<arg>]*]
				1888	*
				1889	* Optional feature arguments are:
				1890	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1891	* ignore_discard: disable discard
				1892	* no_discard_passdown: don't pass discards down to the data device
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1893	*/
				1894	static int pool_ctr(struct dm_target ti, unsigned argc, char *argv)
				1895	{
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1896	int r, pool_created = 0;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1897	struct pool_c *pt;
				1898	struct pool *pool;
				1899	struct pool_features pf;
				1900	struct dm_arg_set as;
				1901	struct dm_dev *data_dev;
				1902	unsigned long block_size;
				1903	dm_block_t low_water_blocks;
				1904	struct dm_dev *metadata_dev;
				1905	sector_t metadata_dev_size;
Mike Snitzer	c4a69ec	2012-03-28 18:41:28 +0100	[diff] [blame]	1906	char b[BDEVNAME_SIZE];
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1907
				1908	/*
				1909	* FIXME Remove validation from scope of lock.
				1910	*/
				1911	mutex_lock(&dm_thin_pool_table.mutex);
				1912
				1913	if (argc < 4) {
				1914	ti->error = "Invalid argument count";
				1915	r = -EINVAL;
				1916	goto out_unlock;
				1917	}
				1918	as.argc = argc;
				1919	as.argv = argv;
				1920
				1921	r = dm_get_device(ti, argv[0], FMODE_READ \| FMODE_WRITE, &metadata_dev);
				1922	if (r) {
				1923	ti->error = "Error opening metadata block device";
				1924	goto out_unlock;
				1925	}
				1926
				1927	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
Mike Snitzer	c4a69ec	2012-03-28 18:41:28 +0100	[diff] [blame]	1928	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
				1929	DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
				1930	bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1931
				1932	r = dm_get_device(ti, argv[1], FMODE_READ \| FMODE_WRITE, &data_dev);
				1933	if (r) {
				1934	ti->error = "Error getting data device";
				1935	goto out_metadata;
				1936	}
				1937
				1938	if (kstrtoul(argv[2], 10, &block_size) \|\| !block_size \|\|
				1939	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
				1940	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
				1941	!is_power_of_2(block_size)) {
				1942	ti->error = "Invalid block size";
				1943	r = -EINVAL;
				1944	goto out;
				1945	}
				1946
				1947	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
				1948	ti->error = "Invalid low water mark";
				1949	r = -EINVAL;
				1950	goto out;
				1951	}
				1952
				1953	/*
				1954	* Set default pool features.
				1955	*/
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1956	pool_features_init(&pf);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1957
				1958	dm_consume_args(&as, 4);
				1959	r = parse_pool_features(&as, &pf, ti);
				1960	if (r)
				1961	goto out;
				1962
				1963	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
				1964	if (!pt) {
				1965	r = -ENOMEM;
				1966	goto out;
				1967	}
				1968
				1969	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1970	block_size, &ti->error, &pool_created);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	1971	if (IS_ERR(pool)) {
				1972	r = PTR_ERR(pool);
				1973	goto out_free_pt;
				1974	}
				1975
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	1976	/*
				1977	* 'pool_created' reflects whether this is the first table load.
				1978	* Top level discard support is not allowed to be changed after
				1979	* initial load. This would require a pool reload to trigger thin
				1980	* device changes.
				1981	*/
				1982	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
				1983	ti->error = "Discard support cannot be disabled once enabled";
				1984	r = -EINVAL;
				1985	goto out_flags_changed;
				1986	}
				1987
				1988	/*
				1989	* If discard_passdown was enabled verify that the data device
				1990	* supports discards. Disable discard_passdown if not; otherwise
				1991	* -EOPNOTSUPP will be returned.
				1992	*/
				1993	if (pf.discard_passdown) {
				1994	struct request_queue *q = bdev_get_queue(data_dev->bdev);
				1995	if (!q \|\| !blk_queue_discard(q)) {
				1996	DMWARN("Discard unsupported by data device: Disabling discard passdown.");
				1997	pf.discard_passdown = 0;
				1998	}
				1999	}
				2000
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2001	pt->pool = pool;
				2002	pt->ti = ti;
				2003	pt->metadata_dev = metadata_dev;
				2004	pt->data_dev = data_dev;
				2005	pt->low_water_blocks = low_water_blocks;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2006	pt->pf = pf;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2007	ti->num_flush_requests = 1;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2008	/*
				2009	* Only need to enable discards if the pool should pass
				2010	* them down to the data device. The thin device's discard
				2011	* processing will cause mappings to be removed from the btree.
				2012	*/
				2013	if (pf.discard_enabled && pf.discard_passdown) {
				2014	ti->num_discard_requests = 1;
				2015	/*
				2016	* Setting 'discards_supported' circumvents the normal
				2017	* stacking of discard limits (this keeps the pool and
				2018	* thin devices' discard limits consistent).
				2019	*/
				2020	ti->discards_supported = 1;
				2021	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2022	ti->private = pt;
				2023
				2024	pt->callbacks.congested_fn = pool_is_congested;
				2025	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
				2026
				2027	mutex_unlock(&dm_thin_pool_table.mutex);
				2028
				2029	return 0;
				2030
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2031	out_flags_changed:
				2032	__pool_dec(pool);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2033	out_free_pt:
				2034	kfree(pt);
				2035	out:
				2036	dm_put_device(ti, data_dev);
				2037	out_metadata:
				2038	dm_put_device(ti, metadata_dev);
				2039	out_unlock:
				2040	mutex_unlock(&dm_thin_pool_table.mutex);
				2041
				2042	return r;
				2043	}
				2044
				2045	static int pool_map(struct dm_target ti, struct bio bio,
				2046	union map_info *map_context)
				2047	{
				2048	int r;
				2049	struct pool_c *pt = ti->private;
				2050	struct pool *pool = pt->pool;
				2051	unsigned long flags;
				2052
				2053	/*
				2054	* As this is a singleton target, ti->begin is always zero.
				2055	*/
				2056	spin_lock_irqsave(&pool->lock, flags);
				2057	bio->bi_bdev = pt->data_dev->bdev;
				2058	r = DM_MAPIO_REMAPPED;
				2059	spin_unlock_irqrestore(&pool->lock, flags);
				2060
				2061	return r;
				2062	}
				2063
				2064	/*
				2065	* Retrieves the number of blocks of the data device from
				2066	* the superblock and compares it to the actual device size,
				2067	* thus resizing the data device in case it has grown.
				2068	*
				2069	* This both copes with opening preallocated data devices in the ctr
				2070	* being followed by a resume
				2071	* -and-
				2072	* calling the resume method individually after userspace has
				2073	* grown the data device in reaction to a table event.
				2074	*/
				2075	static int pool_preresume(struct dm_target *ti)
				2076	{
				2077	int r;
				2078	struct pool_c *pt = ti->private;
				2079	struct pool *pool = pt->pool;
				2080	dm_block_t data_size, sb_data_size;
				2081
				2082	/*
				2083	* Take control of the pool object.
				2084	*/
				2085	r = bind_control_target(pool, ti);
				2086	if (r)
				2087	return r;
				2088
				2089	data_size = ti->len >> pool->block_shift;
				2090	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
				2091	if (r) {
				2092	DMERR("failed to retrieve data device size");
				2093	return r;
				2094	}
				2095
				2096	if (data_size < sb_data_size) {
				2097	DMERR("pool target too small, is %llu blocks (expected %llu)",
				2098	data_size, sb_data_size);
				2099	return -EINVAL;
				2100
				2101	} else if (data_size > sb_data_size) {
				2102	r = dm_pool_resize_data_dev(pool->pmd, data_size);
				2103	if (r) {
				2104	DMERR("failed to resize data device");
				2105	return r;
				2106	}
				2107
				2108	r = dm_pool_commit_metadata(pool->pmd);
				2109	if (r) {
				2110	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				2111	__func__, r);
				2112	return r;
				2113	}
				2114	}
				2115
				2116	return 0;
				2117	}
				2118
				2119	static void pool_resume(struct dm_target *ti)
				2120	{
				2121	struct pool_c *pt = ti->private;
				2122	struct pool *pool = pt->pool;
				2123	unsigned long flags;
				2124
				2125	spin_lock_irqsave(&pool->lock, flags);
				2126	pool->low_water_triggered = 0;
				2127	pool->no_free_space = 0;
				2128	__requeue_bios(pool);
				2129	spin_unlock_irqrestore(&pool->lock, flags);
				2130
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	2131	do_waker(&pool->waker.work);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2132	}
				2133
				2134	static void pool_postsuspend(struct dm_target *ti)
				2135	{
				2136	int r;
				2137	struct pool_c *pt = ti->private;
				2138	struct pool *pool = pt->pool;
				2139
Joe Thornber	905e51b	2012-03-28 18:41:27 +0100	[diff] [blame]	2140	cancel_delayed_work(&pool->waker);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2141	flush_workqueue(pool->wq);
				2142
				2143	r = dm_pool_commit_metadata(pool->pmd);
				2144	if (r < 0) {
				2145	DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				2146	__func__, r);
				2147	/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
				2148	}
				2149	}
				2150
				2151	static int check_arg_count(unsigned argc, unsigned args_required)
				2152	{
				2153	if (argc != args_required) {
				2154	DMWARN("Message received with %u arguments instead of %u.",
				2155	argc, args_required);
				2156	return -EINVAL;
				2157	}
				2158
				2159	return 0;
				2160	}
				2161
				2162	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
				2163	{
				2164	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
				2165	*dev_id <= MAX_DEV_ID)
				2166	return 0;
				2167
				2168	if (warning)
				2169	DMWARN("Message received with invalid device id: %s", arg);
				2170
				2171	return -EINVAL;
				2172	}
				2173
				2174	static int process_create_thin_mesg(unsigned argc, char *argv, struct pool pool)
				2175	{
				2176	dm_thin_id dev_id;
				2177	int r;
				2178
				2179	r = check_arg_count(argc, 2);
				2180	if (r)
				2181	return r;
				2182
				2183	r = read_dev_id(argv[1], &dev_id, 1);
				2184	if (r)
				2185	return r;
				2186
				2187	r = dm_pool_create_thin(pool->pmd, dev_id);
				2188	if (r) {
				2189	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
				2190	argv[1]);
				2191	return r;
				2192	}
				2193
				2194	return 0;
				2195	}
				2196
				2197	static int process_create_snap_mesg(unsigned argc, char *argv, struct pool pool)
				2198	{
				2199	dm_thin_id dev_id;
				2200	dm_thin_id origin_dev_id;
				2201	int r;
				2202
				2203	r = check_arg_count(argc, 3);
				2204	if (r)
				2205	return r;
				2206
				2207	r = read_dev_id(argv[1], &dev_id, 1);
				2208	if (r)
				2209	return r;
				2210
				2211	r = read_dev_id(argv[2], &origin_dev_id, 1);
				2212	if (r)
				2213	return r;
				2214
				2215	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
				2216	if (r) {
				2217	DMWARN("Creation of new snapshot %s of device %s failed.",
				2218	argv[1], argv[2]);
				2219	return r;
				2220	}
				2221
				2222	return 0;
				2223	}
				2224
				2225	static int process_delete_mesg(unsigned argc, char *argv, struct pool pool)
				2226	{
				2227	dm_thin_id dev_id;
				2228	int r;
				2229
				2230	r = check_arg_count(argc, 2);
				2231	if (r)
				2232	return r;
				2233
				2234	r = read_dev_id(argv[1], &dev_id, 1);
				2235	if (r)
				2236	return r;
				2237
				2238	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
				2239	if (r)
				2240	DMWARN("Deletion of thin device %s failed.", argv[1]);
				2241
				2242	return r;
				2243	}
				2244
				2245	static int process_set_transaction_id_mesg(unsigned argc, char *argv, struct pool pool)
				2246	{
				2247	dm_thin_id old_id, new_id;
				2248	int r;
				2249
				2250	r = check_arg_count(argc, 3);
				2251	if (r)
				2252	return r;
				2253
				2254	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
				2255	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
				2256	return -EINVAL;
				2257	}
				2258
				2259	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
				2260	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
				2261	return -EINVAL;
				2262	}
				2263
				2264	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
				2265	if (r) {
				2266	DMWARN("Failed to change transaction id from %s to %s.",
				2267	argv[1], argv[2]);
				2268	return r;
				2269	}
				2270
				2271	return 0;
				2272	}
				2273
				2274	/*
				2275	* Messages supported:
				2276	* create_thin <dev_id>
				2277	* create_snap <dev_id> <origin_id>
				2278	* delete <dev_id>
				2279	* trim <dev_id> <new_size_in_sectors>
				2280	* set_transaction_id <current_trans_id> <new_trans_id>
				2281	*/
				2282	static int pool_message(struct dm_target ti, unsigned argc, char *argv)
				2283	{
				2284	int r = -EINVAL;
				2285	struct pool_c *pt = ti->private;
				2286	struct pool *pool = pt->pool;
				2287
				2288	if (!strcasecmp(argv[0], "create_thin"))
				2289	r = process_create_thin_mesg(argc, argv, pool);
				2290
				2291	else if (!strcasecmp(argv[0], "create_snap"))
				2292	r = process_create_snap_mesg(argc, argv, pool);
				2293
				2294	else if (!strcasecmp(argv[0], "delete"))
				2295	r = process_delete_mesg(argc, argv, pool);
				2296
				2297	else if (!strcasecmp(argv[0], "set_transaction_id"))
				2298	r = process_set_transaction_id_mesg(argc, argv, pool);
				2299
				2300	else
				2301	DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
				2302
				2303	if (!r) {
				2304	r = dm_pool_commit_metadata(pool->pmd);
				2305	if (r)
				2306	DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
				2307	argv[0], r);
				2308	}
				2309
				2310	return r;
				2311	}
				2312
				2313	/*
				2314	* Status line is:
				2315	* <transaction id> <used metadata sectors>/<total metadata sectors>
				2316	* <used data sectors>/<total data sectors> <held metadata root>
				2317	*/
				2318	static int pool_status(struct dm_target *ti, status_type_t type,
				2319	char *result, unsigned maxlen)
				2320	{
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2321	int r, count;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2322	unsigned sz = 0;
				2323	uint64_t transaction_id;
				2324	dm_block_t nr_free_blocks_data;
				2325	dm_block_t nr_free_blocks_metadata;
				2326	dm_block_t nr_blocks_data;
				2327	dm_block_t nr_blocks_metadata;
				2328	dm_block_t held_root;
				2329	char buf[BDEVNAME_SIZE];
				2330	char buf2[BDEVNAME_SIZE];
				2331	struct pool_c *pt = ti->private;
				2332	struct pool *pool = pt->pool;
				2333
				2334	switch (type) {
				2335	case STATUSTYPE_INFO:
				2336	r = dm_pool_get_metadata_transaction_id(pool->pmd,
				2337	&transaction_id);
				2338	if (r)
				2339	return r;
				2340
				2341	r = dm_pool_get_free_metadata_block_count(pool->pmd,
				2342	&nr_free_blocks_metadata);
				2343	if (r)
				2344	return r;
				2345
				2346	r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
				2347	if (r)
				2348	return r;
				2349
				2350	r = dm_pool_get_free_block_count(pool->pmd,
				2351	&nr_free_blocks_data);
				2352	if (r)
				2353	return r;
				2354
				2355	r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
				2356	if (r)
				2357	return r;
				2358
				2359	r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
				2360	if (r)
				2361	return r;
				2362
				2363	DMEMIT("%llu %llu/%llu %llu/%llu ",
				2364	(unsigned long long)transaction_id,
				2365	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
				2366	(unsigned long long)nr_blocks_metadata,
				2367	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
				2368	(unsigned long long)nr_blocks_data);
				2369
				2370	if (held_root)
				2371	DMEMIT("%llu", held_root);
				2372	else
				2373	DMEMIT("-");
				2374
				2375	break;
				2376
				2377	case STATUSTYPE_TABLE:
				2378	DMEMIT("%s %s %lu %llu ",
				2379	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
				2380	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
				2381	(unsigned long)pool->sectors_per_block,
				2382	(unsigned long long)pt->low_water_blocks);
				2383
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2384	count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
				2385	!pool->pf.discard_passdown;
				2386	DMEMIT("%u ", count);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2387
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2388	if (!pool->pf.zero_new_blocks)
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2389	DMEMIT("skip_block_zeroing ");
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2390
				2391	if (!pool->pf.discard_enabled)
				2392	DMEMIT("ignore_discard ");
				2393
				2394	if (!pool->pf.discard_passdown)
				2395	DMEMIT("no_discard_passdown ");
				2396
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2397	break;
				2398	}
				2399
				2400	return 0;
				2401	}
				2402
				2403	static int pool_iterate_devices(struct dm_target *ti,
				2404	iterate_devices_callout_fn fn, void *data)
				2405	{
				2406	struct pool_c *pt = ti->private;
				2407
				2408	return fn(ti, pt->data_dev, 0, ti->len, data);
				2409	}
				2410
				2411	static int pool_merge(struct dm_target ti, struct bvec_merge_data bvm,
				2412	struct bio_vec *biovec, int max_size)
				2413	{
				2414	struct pool_c *pt = ti->private;
				2415	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
				2416
				2417	if (!q->merge_bvec_fn)
				2418	return max_size;
				2419
				2420	bvm->bi_bdev = pt->data_dev->bdev;
				2421
				2422	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
				2423	}
				2424
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2425	static void set_discard_limits(struct pool pool, struct queue_limits limits)
				2426	{
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2427	/*
				2428	* FIXME: these limits may be incompatible with the pool's data device
				2429	*/
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2430	limits->max_discard_sectors = pool->sectors_per_block;
				2431
				2432	/*
				2433	* This is just a hint, and not enforced. We have to cope with
				2434	* bios that overlap 2 blocks.
				2435	*/
				2436	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2437	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2438	}
				2439
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2440	static void pool_io_hints(struct dm_target ti, struct queue_limits limits)
				2441	{
				2442	struct pool_c *pt = ti->private;
				2443	struct pool *pool = pt->pool;
				2444
				2445	blk_limits_io_min(limits, 0);
				2446	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2447	if (pool->pf.discard_enabled)
				2448	set_discard_limits(pool, limits);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2449	}
				2450
				2451	static struct target_type pool_target = {
				2452	.name = "thin-pool",
				2453	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
				2454	DM_TARGET_IMMUTABLE,
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2455	.version = {1, 1, 0},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2456	.module = THIS_MODULE,
				2457	.ctr = pool_ctr,
				2458	.dtr = pool_dtr,
				2459	.map = pool_map,
				2460	.postsuspend = pool_postsuspend,
				2461	.preresume = pool_preresume,
				2462	.resume = pool_resume,
				2463	.message = pool_message,
				2464	.status = pool_status,
				2465	.merge = pool_merge,
				2466	.iterate_devices = pool_iterate_devices,
				2467	.io_hints = pool_io_hints,
				2468	};
				2469
/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
				2473	static void thin_dtr(struct dm_target *ti)
				2474	{
				2475	struct thin_c *tc = ti->private;
				2476
				2477	mutex_lock(&dm_thin_pool_table.mutex);
				2478
				2479	__pool_dec(tc->pool);
				2480	dm_pool_close_thin_device(tc->td);
				2481	dm_put_device(ti, tc->pool_dev);
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2482	if (tc->origin_dev)
				2483	dm_put_device(ti, tc->origin_dev);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2484	kfree(tc);
				2485
				2486	mutex_unlock(&dm_thin_pool_table.mutex);
				2487	}
				2488
				2489	/*
				2490	* Thin target parameters:
				2491	*
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2492	* <pool_dev> <dev_id> [origin_dev]
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2493	*
				2494	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
				2495	* dev_id: the internal device identifier
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2496	* origin_dev: a device external to the pool that should act as the origin
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2497	*
				2498	* If the pool device has discards disabled, they get disabled for the thin
				2499	* device as well.
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2500	*/
				2501	static int thin_ctr(struct dm_target ti, unsigned argc, char *argv)
				2502	{
				2503	int r;
				2504	struct thin_c *tc;
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2505	struct dm_dev pool_dev, origin_dev;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2506	struct mapped_device *pool_md;
				2507
				2508	mutex_lock(&dm_thin_pool_table.mutex);
				2509
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2510	if (argc != 2 && argc != 3) {
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2511	ti->error = "Invalid argument count";
				2512	r = -EINVAL;
				2513	goto out_unlock;
				2514	}
				2515
				2516	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
				2517	if (!tc) {
				2518	ti->error = "Out of memory";
				2519	r = -ENOMEM;
				2520	goto out_unlock;
				2521	}
				2522
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2523	if (argc == 3) {
				2524	r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
				2525	if (r) {
				2526	ti->error = "Error opening origin device";
				2527	goto bad_origin_dev;
				2528	}
				2529	tc->origin_dev = origin_dev;
				2530	}
				2531
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2532	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
				2533	if (r) {
				2534	ti->error = "Error opening pool device";
				2535	goto bad_pool_dev;
				2536	}
				2537	tc->pool_dev = pool_dev;
				2538
				2539	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
				2540	ti->error = "Invalid device id";
				2541	r = -EINVAL;
				2542	goto bad_common;
				2543	}
				2544
				2545	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
				2546	if (!pool_md) {
				2547	ti->error = "Couldn't get pool mapped device";
				2548	r = -EINVAL;
				2549	goto bad_common;
				2550	}
				2551
				2552	tc->pool = __pool_table_lookup(pool_md);
				2553	if (!tc->pool) {
				2554	ti->error = "Couldn't find pool object";
				2555	r = -EINVAL;
				2556	goto bad_pool_lookup;
				2557	}
				2558	__pool_inc(tc->pool);
				2559
				2560	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
				2561	if (r) {
				2562	ti->error = "Couldn't open thin internal device";
				2563	goto bad_thin_open;
				2564	}
				2565
				2566	ti->split_io = tc->pool->sectors_per_block;
				2567	ti->num_flush_requests = 1;
Joe Thornber	67e2e2b	2012-03-28 18:41:29 +0100	[diff] [blame]	2568
				2569	/* In case the pool supports discards, pass them on. */
				2570	if (tc->pool->pf.discard_enabled) {
				2571	ti->discards_supported = 1;
				2572	ti->num_discard_requests = 1;
				2573	}
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2574
				2575	dm_put(pool_md);
				2576
				2577	mutex_unlock(&dm_thin_pool_table.mutex);
				2578
				2579	return 0;
				2580
				2581	bad_thin_open:
				2582	__pool_dec(tc->pool);
				2583	bad_pool_lookup:
				2584	dm_put(pool_md);
				2585	bad_common:
				2586	dm_put_device(ti, tc->pool_dev);
				2587	bad_pool_dev:
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2588	if (tc->origin_dev)
				2589	dm_put_device(ti, tc->origin_dev);
				2590	bad_origin_dev:
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2591	kfree(tc);
				2592	out_unlock:
				2593	mutex_unlock(&dm_thin_pool_table.mutex);
				2594
				2595	return r;
				2596	}
				2597
				2598	static int thin_map(struct dm_target ti, struct bio bio,
				2599	union map_info *map_context)
				2600	{
Alasdair G Kergon	6efd6e8	2012-03-28 18:41:28 +0100	[diff] [blame]	2601	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2602
				2603	return thin_bio_map(ti, bio, map_context);
				2604	}
				2605
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2606	static int thin_endio(struct dm_target *ti,
				2607	struct bio *bio, int err,
				2608	union map_info *map_context)
				2609	{
				2610	unsigned long flags;
				2611	struct endio_hook *h = map_context->ptr;
				2612	struct list_head work;
				2613	struct new_mapping m, tmp;
				2614	struct pool *pool = h->tc->pool;
				2615
				2616	if (h->shared_read_entry) {
				2617	INIT_LIST_HEAD(&work);
				2618	ds_dec(h->shared_read_entry, &work);
				2619
				2620	spin_lock_irqsave(&pool->lock, flags);
				2621	list_for_each_entry_safe(m, tmp, &work, list) {
				2622	list_del(&m->list);
				2623	m->quiesced = 1;
				2624	__maybe_add_mapping(m);
				2625	}
				2626	spin_unlock_irqrestore(&pool->lock, flags);
				2627	}
				2628
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2629	if (h->all_io_entry) {
				2630	INIT_LIST_HEAD(&work);
				2631	ds_dec(h->all_io_entry, &work);
				2632	list_for_each_entry_safe(m, tmp, &work, list)
				2633	list_add(&m->list, &pool->prepared_discards);
				2634	}
				2635
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2636	mempool_free(h, pool->endio_hook_pool);
				2637
				2638	return 0;
				2639	}
				2640
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2641	static void thin_postsuspend(struct dm_target *ti)
				2642	{
				2643	if (dm_noflush_suspending(ti))
				2644	requeue_io((struct thin_c *)ti->private);
				2645	}
				2646
				2647	/*
				2648	* <nr mapped sectors> <highest mapped sector>
				2649	*/
				2650	static int thin_status(struct dm_target *ti, status_type_t type,
				2651	char *result, unsigned maxlen)
				2652	{
				2653	int r;
				2654	ssize_t sz = 0;
				2655	dm_block_t mapped, highest;
				2656	char buf[BDEVNAME_SIZE];
				2657	struct thin_c *tc = ti->private;
				2658
				2659	if (!tc->td)
				2660	DMEMIT("-");
				2661	else {
				2662	switch (type) {
				2663	case STATUSTYPE_INFO:
				2664	r = dm_thin_get_mapped_count(tc->td, &mapped);
				2665	if (r)
				2666	return r;
				2667
				2668	r = dm_thin_get_highest_mapped_block(tc->td, &highest);
				2669	if (r < 0)
				2670	return r;
				2671
				2672	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
				2673	if (r)
				2674	DMEMIT("%llu", ((highest + 1) *
				2675	tc->pool->sectors_per_block) - 1);
				2676	else
				2677	DMEMIT("-");
				2678	break;
				2679
				2680	case STATUSTYPE_TABLE:
				2681	DMEMIT("%s %lu",
				2682	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
				2683	(unsigned long) tc->dev_id);
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2684	if (tc->origin_dev)
				2685	DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2686	break;
				2687	}
				2688	}
				2689
				2690	return 0;
				2691	}
				2692
				2693	static int thin_iterate_devices(struct dm_target *ti,
				2694	iterate_devices_callout_fn fn, void *data)
				2695	{
				2696	dm_block_t blocks;
				2697	struct thin_c *tc = ti->private;
				2698
				2699	/*
				2700	* We can't call dm_pool_get_data_dev_size() since that blocks. So
				2701	* we follow a more convoluted path through to the pool's target.
				2702	*/
				2703	if (!tc->pool->ti)
				2704	return 0; /* nothing is bound */
				2705
				2706	blocks = tc->pool->ti->len >> tc->pool->block_shift;
				2707	if (blocks)
				2708	return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
				2709
				2710	return 0;
				2711	}
				2712
				2713	static void thin_io_hints(struct dm_target ti, struct queue_limits limits)
				2714	{
				2715	struct thin_c *tc = ti->private;
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2716	struct pool *pool = tc->pool;
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2717
				2718	blk_limits_io_min(limits, 0);
Joe Thornber	104655f	2012-03-28 18:41:28 +0100	[diff] [blame]	2719	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
				2720	set_discard_limits(pool, limits);
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2721	}
				2722
				2723	static struct target_type thin_target = {
				2724	.name = "thin",
Joe Thornber	2dd9c25	2012-03-28 18:41:28 +0100	[diff] [blame]	2725	.version = {1, 1, 0},
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2726	.module = THIS_MODULE,
				2727	.ctr = thin_ctr,
				2728	.dtr = thin_dtr,
				2729	.map = thin_map,
Joe Thornber	eb2aa48	2012-03-28 18:41:28 +0100	[diff] [blame]	2730	.end_io = thin_endio,
Joe Thornber	991d9fa	2011-10-31 20:21:18 +0000	[diff] [blame]	2731	.postsuspend = thin_postsuspend,
				2732	.status = thin_status,
				2733	.iterate_devices = thin_iterate_devices,
				2734	.io_hints = thin_io_hints,
				2735	};
				2736
/*----------------------------------------------------------------*/
				2738
				2739	static int __init dm_thin_init(void)
				2740	{
				2741	int r;
				2742
				2743	pool_table_init();
				2744
				2745	r = dm_register_target(&thin_target);
				2746	if (r)
				2747	return r;
				2748
				2749	r = dm_register_target(&pool_target);
				2750	if (r)
				2751	dm_unregister_target(&thin_target);
				2752
				2753	return r;
				2754	}
				2755
				2756	static void dm_thin_exit(void)
				2757	{
				2758	dm_unregister_target(&thin_target);
				2759	dm_unregister_target(&pool_target);
				2760	}
				2761
				2762	module_init(dm_thin_init);
				2763	module_exit(dm_thin_exit);
				2764
				2765	MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target");
				2766	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
				2767	MODULE_LICENSE("GPL");