Blame - fs/buffer.c - kernel/linux-linaro-stable.git

blob: ded29b0fdac3a0db8ee6be49f7a970e3a42dce6b [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required by older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	27	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	28	#include <linux/blkdev.h>
				29	#include <linux/file.h>
				30	#include <linux/quotaops.h>
				31	#include <linux/highmem.h>
				32	#include <linux/module.h>
				33	#include <linux/writeback.h>
				34	#include <linux/hash.h>
				35	#include <linux/suspend.h>
				36	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	37	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	46
				47	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				48
				49	inline void
				50	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				51	{
				52	bh->b_end_io = handler;
				53	bh->b_private = private;
				54	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	55	EXPORT_SYMBOL(init_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	56
				57	static int sync_buffer(void *word)
				58	{
				59	struct block_device *bd;
				60	struct buffer_head *bh
				61	= container_of(word, struct buffer_head, b_state);
				62
				63	smp_mb();
				64	bd = bh->b_bdev;
				65	if (bd)
				66	blk_run_address_space(bd->bd_inode->i_mapping);
				67	io_schedule();
				68	return 0;
				69	}
				70
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	71	void __lock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72	{
				73	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				74	TASK_UNINTERRUPTIBLE);
				75	}
				76	EXPORT_SYMBOL(__lock_buffer);
				77
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	78	void unlock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	79	{
Nick Piggin	51b07fc	2008-10-18 20:27:00 -0700	[diff] [blame]	80	clear_bit_unlock(BH_Lock, &bh->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	81	smp_mb__after_clear_bit();
				82	wake_up_bit(&bh->b_state, BH_Lock);
				83	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	84	EXPORT_SYMBOL(unlock_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	85
				86	/*
				87	* Block until a buffer comes unlocked. This doesn't stop it
				88	* from becoming locked again - you have to lock it yourself
				89	* if you want to preserve its state.
				90	*/
				91	void __wait_on_buffer(struct buffer_head * bh)
				92	{
				93	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				94	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	95	EXPORT_SYMBOL(__wait_on_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	96
				97	static void
				98	__clear_page_buffers(struct page *page)
				99	{
				100	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	101	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	102	page_cache_release(page);
				103	}
				104
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	105
				106	static int quiet_error(struct buffer_head *bh)
				107	{
				108	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
				109	return 0;
				110	return 1;
				111	}
				112
				113
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	114	static void buffer_io_error(struct buffer_head *bh)
				115	{
				116	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	117	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				118	bdevname(bh->b_bdev, b),
				119	(unsigned long long)bh->b_blocknr);
				120	}
				121
				122	/*
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	123	* End-of-IO handler helper function which does not touch the bh after
				124	* unlocking it.
				125	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				126	* a race there is benign: unlock_buffer() only use the bh's address for
				127	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				128	* itself.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	129	*/
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	130	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	131	{
				132	if (uptodate) {
				133	set_buffer_uptodate(bh);
				134	} else {
				135	/* This happens, due to failed READA attempts. */
				136	clear_buffer_uptodate(bh);
				137	}
				138	unlock_buffer(bh);
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	139	}
				140
				141	/*
				142	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				143	* unlock the buffer. This is what ll_rw_block uses too.
				144	*/
				145	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				146	{
				147	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	148	put_bh(bh);
				149	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	150	EXPORT_SYMBOL(end_buffer_read_sync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	151
				152	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				153	{
				154	char b[BDEVNAME_SIZE];
				155
				156	if (uptodate) {
				157	set_buffer_uptodate(bh);
				158	} else {
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	159	if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	160	buffer_io_error(bh);
				161	printk(KERN_WARNING "lost page write due to "
				162	"I/O error on %s\n",
				163	bdevname(bh->b_bdev, b));
				164	}
				165	set_buffer_write_io_error(bh);
				166	clear_buffer_uptodate(bh);
				167	}
				168	unlock_buffer(bh);
				169	put_bh(bh);
				170	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	171	EXPORT_SYMBOL(end_buffer_write_sync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	172
				173	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	174	* Various filesystems appear to want __find_get_block to be non-blocking.
				175	* But it's the page lock which protects the buffers. To get around this,
				176	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				177	* private_lock.
				178	*
				179	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				180	* may be quite high. This code could TryLock the page, and if that
				181	* succeeds, there is no need to take private_lock. (But if
				182	* private_lock is contended then so is mapping->tree_lock).
				183	*/
				184	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	185	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	186	{
				187	struct inode *bd_inode = bdev->bd_inode;
				188	struct address_space *bd_mapping = bd_inode->i_mapping;
				189	struct buffer_head *ret = NULL;
				190	pgoff_t index;
				191	struct buffer_head *bh;
				192	struct buffer_head *head;
				193	struct page *page;
				194	int all_mapped = 1;
				195
				196	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				197	page = find_get_page(bd_mapping, index);
				198	if (!page)
				199	goto out;
				200
				201	spin_lock(&bd_mapping->private_lock);
				202	if (!page_has_buffers(page))
				203	goto out_unlock;
				204	head = page_buffers(page);
				205	bh = head;
				206	do {
Nikanth Karthikesan	97f76d3	2009-04-02 16:56:46 -0700	[diff] [blame]	207	if (!buffer_mapped(bh))
				208	all_mapped = 0;
				209	else if (bh->b_blocknr == block) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	210	ret = bh;
				211	get_bh(bh);
				212	goto out_unlock;
				213	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	214	bh = bh->b_this_page;
				215	} while (bh != head);
				216
				217	/* we might be here because some of the buffers on this page are
				218	* not mapped. This is due to various races between
				219	* file io on the block device and getblk. It gets dealt with
				220	* elsewhere, don't buffer_error if we had some unmapped buffers
				221	*/
				222	if (all_mapped) {
				223	printk("__find_get_block_slow() failed. "
				224	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	225	(unsigned long long)block,
				226	(unsigned long long)bh->b_blocknr);
				227	printk("b_state=0x%08lx, b_size=%zu\n",
				228	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	229	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				230	}
				231	out_unlock:
				232	spin_unlock(&bd_mapping->private_lock);
				233	page_cache_release(page);
				234	out:
				235	return ret;
				236	}
				237
				238	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				239	of fs corruption is going on. Trashing dirty data always imply losing
				240	information that was supposed to be just stored on the physical layer
				241	by the user.
				242
				243	Thus invalidate_buffers in general usage is not allowed to trash
				244	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				245	be preserved. These buffers are simply skipped.
				246
				247	We also skip buffers which are still in use. For example this can
				248	happen if a userspace program is reading the block device.
				249
				250	NOTE: In the case where the user removed a removable-media-disk even if
				251	there's still dirty data not synced on disk (due a bug in the device driver
				252	or due an error of the user), by not destroying the dirty buffers we could
				253	generate corruption also on the next media inserted, thus a parameter is
				254	necessary to handle this case in the most safe way possible (trying
				255	to not corrupt also the new disk inserted with the data belonging to
				256	the old now corrupted disk). Also for the ramdisk the natural thing
				257	to do in order to release the ramdisk memory is to destroy dirty buffers.
				258
				259	These are two special cases. Normal usage imply the device driver
				260	to issue a sync on the device (without waiting I/O completion) and
				261	then an invalidate_buffers call that doesn't trash dirty buffers.
				262
				263	For handling cache coherency with the blkdev pagecache the 'update' case
				264	has been introduced. It is needed to re-read from disk any pinned
				265	buffer. NOTE: re-reading from disk is destructive so we can do it only
				266	when we assume nobody is changing the buffercache under our I/O and when
				267	we think the disk contains more recent information than the buffercache.
				268	The update == 1 pass marks the buffers we need to update, the update == 2
				269	pass does the actual I/O. */
Peter Zijlstra	f98393a	2007-05-06 14:49:54 -0700	[diff] [blame]	270	void invalidate_bdev(struct block_device *bdev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	271	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	272	struct address_space *mapping = bdev->bd_inode->i_mapping;
				273
				274	if (mapping->nrpages == 0)
				275	return;
				276
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	277	invalidate_bh_lrus();
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	278	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	279	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	280	EXPORT_SYMBOL(invalidate_bdev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	281
				282	/*
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	283	* Kick the writeback threads then try to free up some ZONE_NORMAL memory.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	284	*/
				285	static void free_more_memory(void)
				286	{
Mel Gorman	19770b3	2008-04-28 02:12:18 -0700	[diff] [blame]	287	struct zone *zone;
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	288	int nid;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	289
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	290	wakeup_flusher_threads(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	291	yield();
				292
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	293	for_each_online_node(nid) {
Mel Gorman	19770b3	2008-04-28 02:12:18 -0700	[diff] [blame]	294	(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
				295	gfp_zone(GFP_NOFS), NULL,
				296	&zone);
				297	if (zone)
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	298	try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
KAMEZAWA Hiroyuki	327c0e9	2009-03-31 15:23:31 -0700	[diff] [blame]	299	GFP_NOFS, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	300	}
				301	}
				302
				303	/*
				304	* I/O completion handler for block_read_full_page() - pages
				305	* which come unlocked at the end of I/O.
				306	*/
				307	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				308	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	309	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	310	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	311	struct buffer_head *tmp;
				312	struct page *page;
				313	int page_uptodate = 1;
				314
				315	BUG_ON(!buffer_async_read(bh));
				316
				317	page = bh->b_page;
				318	if (uptodate) {
				319	set_buffer_uptodate(bh);
				320	} else {
				321	clear_buffer_uptodate(bh);
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	322	if (!quiet_error(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	323	buffer_io_error(bh);
				324	SetPageError(page);
				325	}
				326
				327	/*
				328	* Be _very_ careful from here on. Bad things can happen if
				329	* two buffer heads end IO at almost the same time and both
				330	* decide that the page is now completely done.
				331	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	332	first = page_buffers(page);
				333	local_irq_save(flags);
				334	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	335	clear_buffer_async_read(bh);
				336	unlock_buffer(bh);
				337	tmp = bh;
				338	do {
				339	if (!buffer_uptodate(tmp))
				340	page_uptodate = 0;
				341	if (buffer_async_read(tmp)) {
				342	BUG_ON(!buffer_locked(tmp));
				343	goto still_busy;
				344	}
				345	tmp = tmp->b_this_page;
				346	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	347	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				348	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	349
				350	/*
				351	* If none of the buffers had errors and they are all
				352	* uptodate then we can set the page uptodate.
				353	*/
				354	if (page_uptodate && !PageError(page))
				355	SetPageUptodate(page);
				356	unlock_page(page);
				357	return;
				358
				359	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	360	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				361	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	362	return;
				363	}
				364
				365	/*
				366	* Completion handler for block_write_full_page() - pages which are unlocked
				367	* during I/O, and which have PageWriteback cleared upon I/O completion.
				368	*/
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	369	void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	370	{
				371	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	372	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	373	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	374	struct buffer_head *tmp;
				375	struct page *page;
				376
				377	BUG_ON(!buffer_async_write(bh));
				378
				379	page = bh->b_page;
				380	if (uptodate) {
				381	set_buffer_uptodate(bh);
				382	} else {
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	383	if (!quiet_error(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	384	buffer_io_error(bh);
				385	printk(KERN_WARNING "lost page write due to "
				386	"I/O error on %s\n",
				387	bdevname(bh->b_bdev, b));
				388	}
				389	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	390	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	391	clear_buffer_uptodate(bh);
				392	SetPageError(page);
				393	}
				394
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	395	first = page_buffers(page);
				396	local_irq_save(flags);
				397	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				398
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	399	clear_buffer_async_write(bh);
				400	unlock_buffer(bh);
				401	tmp = bh->b_this_page;
				402	while (tmp != bh) {
				403	if (buffer_async_write(tmp)) {
				404	BUG_ON(!buffer_locked(tmp));
				405	goto still_busy;
				406	}
				407	tmp = tmp->b_this_page;
				408	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	409	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				410	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	411	end_page_writeback(page);
				412	return;
				413
				414	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	415	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				416	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	417	return;
				418	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	419	EXPORT_SYMBOL(end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	420
				421	/*
				422	* If a page's buffers are under async readin (end_buffer_async_read
				423	* completion) then there is a possibility that another thread of
				424	* control could lock one of the buffers after it has completed
				425	* but while some of the other buffers have not completed. This
				426	* locked buffer would confuse end_buffer_async_read() into not unlocking
				427	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				428	* that this buffer is not under async I/O.
				429	*
				430	* The page comes unlocked when it has no locked buffer_async buffers
				431	* left.
				432	*
				433	* PageLocked prevents anyone from starting new async I/O reads against any of
				434	* the buffers.
				435	*
				436	* PageWriteback is used to prevent simultaneous writeout of the same
				437	* page.
				438	*
				439	* PageLocked prevents anyone from starting writeback of a page which is
				440	* under read I/O (PageWriteback is only ever set against a locked page).
				441	*/
				442	static void mark_buffer_async_read(struct buffer_head *bh)
				443	{
				444	bh->b_end_io = end_buffer_async_read;
				445	set_buffer_async_read(bh);
				446	}
				447
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	448	static void mark_buffer_async_write_endio(struct buffer_head *bh,
				449	bh_end_io_t *handler)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	450	{
				451	bh->b_end_io = handler;
				452	set_buffer_async_write(bh);
				453	}
				454
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	455	void mark_buffer_async_write(struct buffer_head *bh)
				456	{
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	457	mark_buffer_async_write_endio(bh, end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	458	}
				459	EXPORT_SYMBOL(mark_buffer_async_write);
				460
				461
				462	/*
				463	* fs/buffer.c contains helper functions for buffer-backed address space's
				464	* fsync functions. A common requirement for buffer-based filesystems is
				465	* that certain data from the backing blockdev needs to be written out for
				466	* a successful fsync(). For example, ext2 indirect blocks need to be
				467	* written back and waited upon before fsync() returns.
				468	*
				469	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				470	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				471	* management of a list of dependent buffers at ->i_mapping->private_list.
				472	*
				473	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				474	* from their controlling inode's queue when they are being freed. But
				475	* try_to_free_buffers() will be operating against the blockdev mapping
				476	* at the time, not against the S_ISREG file which depends on those buffers.
				477	* So the locking for private_list is via the private_lock in the address_space
				478	* which backs the buffers. Which is different from the address_space
				479	* against which the buffers are listed. So for a particular address_space,
				480	* mapping->private_lock does not protect mapping->private_list! In fact,
				481	* mapping->private_list will always be protected by the backing blockdev's
				482	* ->private_lock.
				483	*
				484	* Which introduces a requirement: all buffers on an address_space's
				485	* ->private_list must be from the same address_space: the blockdev's.
				486	*
				487	* address_spaces which do not place buffers at ->private_list via these
				488	* utility functions are free to use private_lock and private_list for
				489	* whatever they want. The only requirement is that list_empty(private_list)
				490	* be true at clear_inode() time.
				491	*
				492	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				493	* filesystems should do that. invalidate_inode_buffers() should just go
				494	* BUG_ON(!list_empty).
				495	*
				496	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				497	* take an address_space, not an inode. And it should be called
				498	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				499	* queued up.
				500	*
				501	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				502	* list if it is already on a list. Because if the buffer is on a list,
				503	* it must already be on the right one. If not, the filesystem is being
				504	* silly. This will save a ton of locking. But first we have to ensure
				505	* that buffers are taken off the old inode's list when they are freed
				506	* (presumably in truncate). That requires careful auditing of all
				507	* filesystems (do it inside bforget()). It could also be done by bringing
				508	* b_inode back.
				509	*/
				510
				511	/*
				512	* The buffer's backing address_space's private_lock must be held
				513	*/
Thomas Petazzoni	dbacefc	2008-07-29 22:33:47 -0700	[diff] [blame]	514	static void __remove_assoc_queue(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	515	{
				516	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	517	WARN_ON(!bh->b_assoc_map);
				518	if (buffer_write_io_error(bh))
				519	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				520	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	521	}
				522
				523	int inode_has_buffers(struct inode *inode)
				524	{
				525	return !list_empty(&inode->i_data.private_list);
				526	}
				527
				528	/*
				529	* osync is designed to support O_SYNC io. It waits synchronously for
				530	* all already-submitted IO to complete, but does not queue any new
				531	* writes to the disk.
				532	*
				533	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				534	* you dirty the buffers, and then use osync_inode_buffers to wait for
				535	* completion. Any other dirty buffers which are not yet queued for
				536	* write will not be flushed to disk by the osync.
				537	*/
				538	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				539	{
				540	struct buffer_head *bh;
				541	struct list_head *p;
				542	int err = 0;
				543
				544	spin_lock(lock);
				545	repeat:
				546	list_for_each_prev(p, list) {
				547	bh = BH_ENTRY(p);
				548	if (buffer_locked(bh)) {
				549	get_bh(bh);
				550	spin_unlock(lock);
				551	wait_on_buffer(bh);
				552	if (!buffer_uptodate(bh))
				553	err = -EIO;
				554	brelse(bh);
				555	spin_lock(lock);
				556	goto repeat;
				557	}
				558	}
				559	spin_unlock(lock);
				560	return err;
				561	}
				562
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	563	static void do_thaw_all(struct work_struct *work)
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	564	{
Al Viro	6754af6	2010-03-22 20:09:33 -0400	[diff] [blame]	565	struct super_block sb, n;
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	566	char b[BDEVNAME_SIZE];
				567
				568	spin_lock(&sb_lock);
Al Viro	6754af6	2010-03-22 20:09:33 -0400	[diff] [blame]	569	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
Al Viro	551de6f	2010-03-22 19:36:35 -0400	[diff] [blame]	570	if (list_empty(&sb->s_instances))
				571	continue;
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	572	sb->s_count++;
				573	spin_unlock(&sb_lock);
				574	down_read(&sb->s_umount);
				575	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
				576	printk(KERN_WARNING "Emergency Thaw on %s\n",
				577	bdevname(sb->s_bdev, b));
				578	up_read(&sb->s_umount);
				579	spin_lock(&sb_lock);
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	580	}
				581	spin_unlock(&sb_lock);
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	582	kfree(work);
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	583	printk(KERN_WARNING "Emergency Thaw complete\n");
				584	}
				585
				586	/**
				587	* emergency_thaw_all -- forcibly thaw every frozen filesystem
				588	*
				589	* Used for emergency unfreeze of all filesystems via SysRq
				590	*/
				591	void emergency_thaw_all(void)
				592	{
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	593	struct work_struct *work;
				594
				595	work = kmalloc(sizeof(*work), GFP_ATOMIC);
				596	if (work) {
				597	INIT_WORK(work, do_thaw_all);
				598	schedule_work(work);
				599	}
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	600	}
				601
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	602	/**
Randy Dunlap	78a4a50	2008-02-29 22:02:31 -0800	[diff] [blame]	603	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	604	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605	*
				606	* Starts I/O against the buffers at mapping->private_list, and waits upon
				607	* that I/O.
				608	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	609	* Basically, this is a convenience function for fsync().
				610	* @mapping is a file or directory which needs those buffers to be written for
				611	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	612	*/
				613	int sync_mapping_buffers(struct address_space *mapping)
				614	{
				615	struct address_space *buffer_mapping = mapping->assoc_mapping;
				616
				617	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				618	return 0;
				619
				620	return fsync_buffers_list(&buffer_mapping->private_lock,
				621	&mapping->private_list);
				622	}
				623	EXPORT_SYMBOL(sync_mapping_buffers);
				624
				625	/*
				626	* Called when we've recently written block `bblock', and it is known that
				627	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				628	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				629	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				630	*/
				631	void write_boundary_block(struct block_device *bdev,
				632	sector_t bblock, unsigned blocksize)
				633	{
				634	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				635	if (bh) {
				636	if (buffer_dirty(bh))
				637	ll_rw_block(WRITE, 1, &bh);
				638	put_bh(bh);
				639	}
				640	}
				641
				642	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				643	{
				644	struct address_space *mapping = inode->i_mapping;
				645	struct address_space *buffer_mapping = bh->b_page->mapping;
				646
				647	mark_buffer_dirty(bh);
				648	if (!mapping->assoc_mapping) {
				649	mapping->assoc_mapping = buffer_mapping;
				650	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	651	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	652	}
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	653	if (!bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	654	spin_lock(&buffer_mapping->private_lock);
				655	list_move_tail(&bh->b_assoc_buffers,
				656	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	657	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	658	spin_unlock(&buffer_mapping->private_lock);
				659	}
				660	}
				661	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				662
				663	/*
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	664	* Mark the page dirty, and set it dirty in the radix tree, and mark the inode
				665	* dirty.
				666	*
				667	* If warn is true, then emit a warning if the page is not uptodate and has
				668	* not been truncated.
				669	*/
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	670	static void __set_page_dirty(struct page *page,
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	671	struct address_space *mapping, int warn)
				672	{
Nick Piggin	19fd623	2008-07-25 19:45:32 -0700	[diff] [blame]	673	spin_lock_irq(&mapping->tree_lock);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	674	if (page->mapping) { /* Race with truncate? */
				675	WARN_ON_ONCE(warn && !PageUptodate(page));
Edward Shishkin	e3a7cca	2009-03-31 15:19:39 -0700	[diff] [blame]	676	account_page_dirtied(page, mapping);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	677	radix_tree_tag_set(&mapping->page_tree,
				678	page_index(page), PAGECACHE_TAG_DIRTY);
				679	}
Nick Piggin	19fd623	2008-07-25 19:45:32 -0700	[diff] [blame]	680	spin_unlock_irq(&mapping->tree_lock);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	681	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	682	}
				683
				684	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	685	* Add a page to the dirty page list.
				686	*
				687	* It is a sad fact of life that this function is called from several places
				688	* deeply under spinlocking. It may not sleep.
				689	*
				690	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				691	* dirty-state coherency between the page and the buffers. It the page does
				692	* not have buffers then when they are later attached they will all be set
				693	* dirty.
				694	*
				695	* The buffers are dirtied before the page is dirtied. There's a small race
				696	* window in which a writepage caller may see the page cleanness but not the
				697	* buffer dirtiness. That's fine. If this code were to set the page dirty
				698	* before the buffers, a concurrent writepage caller could clear the page dirty
				699	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				700	* page on the dirty page list.
				701	*
				702	* We use private_lock to lock against try_to_free_buffers while using the
				703	* page's buffer list. Also use this to protect against clean buffers being
				704	* added to the page after it was set dirty.
				705	*
				706	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				707	* address_space though.
				708	*/
				709	int __set_page_dirty_buffers(struct page *page)
				710	{
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	711	int newly_dirty;
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	712	struct address_space *mapping = page_mapping(page);
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	713
				714	if (unlikely(!mapping))
				715	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	716
				717	spin_lock(&mapping->private_lock);
				718	if (page_has_buffers(page)) {
				719	struct buffer_head *head = page_buffers(page);
				720	struct buffer_head *bh = head;
				721
				722	do {
				723	set_buffer_dirty(bh);
				724	bh = bh->b_this_page;
				725	} while (bh != head);
				726	}
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	727	newly_dirty = !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	728	spin_unlock(&mapping->private_lock);
				729
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	730	if (newly_dirty)
				731	__set_page_dirty(page, mapping, 1);
				732	return newly_dirty;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	733	}
				734	EXPORT_SYMBOL(__set_page_dirty_buffers);
				735
				736	/*
				737	* Write out and wait upon a list of buffers.
				738	*
				739	* We have conflicting pressures: we want to make sure that all
				740	* initially dirty buffers get waited on, but that any subsequently
				741	* dirtied buffers don't. After all, we don't want fsync to last
				742	* forever if somebody is actively writing to the file.
				743	*
				744	* Do this in two main stages: first we copy dirty buffers to a
				745	* temporary inode list, queueing the writes as we go. Then we clean
				746	* up, waiting for those writes to complete.
				747	*
				748	* During this second stage, any subsequent updates to the file may end
				749	* up refiling the buffer on the original inode's dirty list again, so
				750	* there is a chance we will end up with a buffer queued for write but
				751	* not yet completed on that list. So, as a final cleanup we go through
				752	* the osync code to catch these locked, dirty buffers without requeuing
				753	* any newly dirty buffers for write.
				754	*/
				755	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				756	{
				757	struct buffer_head *bh;
				758	struct list_head tmp;
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	759	struct address_space mapping, prev_mapping = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	760	int err = 0, err2;
				761
				762	INIT_LIST_HEAD(&tmp);
				763
				764	spin_lock(lock);
				765	while (!list_empty(list)) {
				766	bh = BH_ENTRY(list->next);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	767	mapping = bh->b_assoc_map;
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	768	__remove_assoc_queue(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	769	/* Avoid race with mark_buffer_dirty_inode() which does
				770	* a lockless check and we rely on seeing the dirty bit */
				771	smp_mb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	772	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				773	list_add(&bh->b_assoc_buffers, &tmp);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	774	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	775	if (buffer_dirty(bh)) {
				776	get_bh(bh);
				777	spin_unlock(lock);
				778	/*
				779	* Ensure any pending I/O completes so that
				780	* ll_rw_block() actually writes the current
				781	* contents - it is a noop if I/O is still in
				782	* flight on potentially older contents.
				783	*/
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	784	ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
				785
				786	/*
				787	* Kick off IO for the previous mapping. Note
				788	* that we will not run the very last mapping,
				789	* wait_on_buffer() will do that for us
				790	* through sync_buffer().
				791	*/
				792	if (prev_mapping && prev_mapping != mapping)
				793	blk_run_address_space(prev_mapping);
				794	prev_mapping = mapping;
				795
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	796	brelse(bh);
				797	spin_lock(lock);
				798	}
				799	}
				800	}
				801
				802	while (!list_empty(&tmp)) {
				803	bh = BH_ENTRY(tmp.prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	804	get_bh(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	805	mapping = bh->b_assoc_map;
				806	__remove_assoc_queue(bh);
				807	/* Avoid race with mark_buffer_dirty_inode() which does
				808	* a lockless check and we rely on seeing the dirty bit */
				809	smp_mb();
				810	if (buffer_dirty(bh)) {
				811	list_add(&bh->b_assoc_buffers,
Jan Kara	e389229	2008-03-04 14:28:33 -0800	[diff] [blame]	812	&mapping->private_list);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	813	bh->b_assoc_map = mapping;
				814	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	815	spin_unlock(lock);
				816	wait_on_buffer(bh);
				817	if (!buffer_uptodate(bh))
				818	err = -EIO;
				819	brelse(bh);
				820	spin_lock(lock);
				821	}
				822
				823	spin_unlock(lock);
				824	err2 = osync_buffers_list(lock, list);
				825	if (err)
				826	return err;
				827	else
				828	return err2;
				829	}
				830
				831	/*
				832	* Invalidate any and all dirty buffers on a given inode. We are
				833	* probably unmounting the fs, but that doesn't mean we have already
				834	* done a sync(). Just drop the buffers from the inode list.
				835	*
				836	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				837	* assumes that all the buffers are against the blockdev. Not true
				838	* for reiserfs.
				839	*/
				840	void invalidate_inode_buffers(struct inode *inode)
				841	{
				842	if (inode_has_buffers(inode)) {
				843	struct address_space *mapping = &inode->i_data;
				844	struct list_head *list = &mapping->private_list;
				845	struct address_space *buffer_mapping = mapping->assoc_mapping;
				846
				847	spin_lock(&buffer_mapping->private_lock);
				848	while (!list_empty(list))
				849	__remove_assoc_queue(BH_ENTRY(list->next));
				850	spin_unlock(&buffer_mapping->private_lock);
				851	}
				852	}
Jan Kara	52b19ac	2008-09-23 18:24:08 +0200	[diff] [blame]	853	EXPORT_SYMBOL(invalidate_inode_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	854
				855	/*
				856	* Remove any clean buffers from the inode's buffer list. This is called
				857	* when we're trying to free the inode itself. Those buffers can pin it.
				858	*
				859	* Returns true if all buffers were removed.
				860	*/
				861	int remove_inode_buffers(struct inode *inode)
				862	{
				863	int ret = 1;
				864
				865	if (inode_has_buffers(inode)) {
				866	struct address_space *mapping = &inode->i_data;
				867	struct list_head *list = &mapping->private_list;
				868	struct address_space *buffer_mapping = mapping->assoc_mapping;
				869
				870	spin_lock(&buffer_mapping->private_lock);
				871	while (!list_empty(list)) {
				872	struct buffer_head *bh = BH_ENTRY(list->next);
				873	if (buffer_dirty(bh)) {
				874	ret = 0;
				875	break;
				876	}
				877	__remove_assoc_queue(bh);
				878	}
				879	spin_unlock(&buffer_mapping->private_lock);
				880	}
				881	return ret;
				882	}
				883
				884	/*
				885	* Create the appropriate buffers when given a page for data area and
				886	* the size of each buffer.. Use the bh->b_this_page linked list to
				887	* follow the buffers created. Return NULL if unable to create more
				888	* buffers.
				889	*
				890	* The retry flag is used to differentiate async IO (paging, swapping)
				891	* which may not fail from ordinary buffer allocations.
				892	*/
				893	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				894	int retry)
				895	{
				896	struct buffer_head bh, head;
				897	long offset;
				898
				899	try_again:
				900	head = NULL;
				901	offset = PAGE_SIZE;
				902	while ((offset -= size) >= 0) {
				903	bh = alloc_buffer_head(GFP_NOFS);
				904	if (!bh)
				905	goto no_grow;
				906
				907	bh->b_bdev = NULL;
				908	bh->b_this_page = head;
				909	bh->b_blocknr = -1;
				910	head = bh;
				911
				912	bh->b_state = 0;
				913	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	914	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	915	bh->b_size = size;
				916
				917	/* Link the buffer to its page */
				918	set_bh_page(bh, page, offset);
				919
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	920	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	921	}
				922	return head;
				923	/*
				924	* In case anything failed, we just free everything we got.
				925	*/
				926	no_grow:
				927	if (head) {
				928	do {
				929	bh = head;
				930	head = head->b_this_page;
				931	free_buffer_head(bh);
				932	} while (head);
				933	}
				934
				935	/*
				936	* Return failure for non-async IO requests. Async IO requests
				937	* are not allowed to fail, so we have to wait until buffer heads
				938	* become available. But we don't want tasks sleeping with
				939	* partially complete buffers, so all were released above.
				940	*/
				941	if (!retry)
				942	return NULL;
				943
				944	/* We're _really_ low on memory. Now we just
				945	* wait for old buffer heads to become free due to
				946	* finishing IO. Since this is an async request and
				947	* the reserve list is empty, we're sure there are
				948	* async buffer heads in use.
				949	*/
				950	free_more_memory();
				951	goto try_again;
				952	}
				953	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				954
				955	static inline void
				956	link_dev_buffers(struct page page, struct buffer_head head)
				957	{
				958	struct buffer_head bh, tail;
				959
				960	bh = head;
				961	do {
				962	tail = bh;
				963	bh = bh->b_this_page;
				964	} while (bh);
				965	tail->b_this_page = head;
				966	attach_page_buffers(page, head);
				967	}
				968
				969	/*
				970	* Initialise the state of a blockdev page's buffers.
				971	*/
				972	static void
				973	init_page_buffers(struct page page, struct block_device bdev,
				974	sector_t block, int size)
				975	{
				976	struct buffer_head *head = page_buffers(page);
				977	struct buffer_head *bh = head;
				978	int uptodate = PageUptodate(page);
				979
				980	do {
				981	if (!buffer_mapped(bh)) {
				982	init_buffer(bh, NULL, NULL);
				983	bh->b_bdev = bdev;
				984	bh->b_blocknr = block;
				985	if (uptodate)
				986	set_buffer_uptodate(bh);
				987	set_buffer_mapped(bh);
				988	}
				989	block++;
				990	bh = bh->b_this_page;
				991	} while (bh != head);
				992	}
				993
				994	/*
				995	* Create the page-cache page that contains the requested block.
				996	*
				997	* This is user purely for blockdev mappings.
				998	*/
				999	static struct page *
				1000	grow_dev_page(struct block_device *bdev, sector_t block,
				1001	pgoff_t index, int size)
				1002	{
				1003	struct inode *inode = bdev->bd_inode;
				1004	struct page *page;
				1005	struct buffer_head *bh;
				1006
Christoph Lameter	ea12589	2007-05-16 22:11:21 -0700	[diff] [blame]	1007	page = find_or_create_page(inode->i_mapping, index,
Mel Gorman	769848c	2007-07-17 04:03:05 -0700	[diff] [blame]	1008	(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)\|__GFP_MOVABLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1009	if (!page)
				1010	return NULL;
				1011
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1012	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1013
				1014	if (page_has_buffers(page)) {
				1015	bh = page_buffers(page);
				1016	if (bh->b_size == size) {
				1017	init_page_buffers(page, bdev, block, size);
				1018	return page;
				1019	}
				1020	if (!try_to_free_buffers(page))
				1021	goto failed;
				1022	}
				1023
				1024	/*
				1025	* Allocate some buffers for this page
				1026	*/
				1027	bh = alloc_page_buffers(page, size, 0);
				1028	if (!bh)
				1029	goto failed;
				1030
				1031	/*
				1032	* Link the page to the buffers and initialise them. Take the
				1033	* lock to be atomic wrt __find_get_block(), which does not
				1034	* run under the page lock.
				1035	*/
				1036	spin_lock(&inode->i_mapping->private_lock);
				1037	link_dev_buffers(page, bh);
				1038	init_page_buffers(page, bdev, block, size);
				1039	spin_unlock(&inode->i_mapping->private_lock);
				1040	return page;
				1041
				1042	failed:
				1043	BUG();
				1044	unlock_page(page);
				1045	page_cache_release(page);
				1046	return NULL;
				1047	}
				1048
				1049	/*
				1050	* Create buffers for the specified block device block's page. If
				1051	* that page was dirty, the buffers are set dirty also.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1052	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1053	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1054	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1055	{
				1056	struct page *page;
				1057	pgoff_t index;
				1058	int sizebits;
				1059
				1060	sizebits = -1;
				1061	do {
				1062	sizebits++;
				1063	} while ((size << sizebits) < PAGE_SIZE);
				1064
				1065	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1066
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1067	/*
				1068	* Check for a block which wants to lie outside our maximum possible
				1069	* pagecache index. (this comparison is done using sector_t types).
				1070	*/
				1071	if (unlikely(index != block >> sizebits)) {
				1072	char b[BDEVNAME_SIZE];
				1073
				1074	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1075	"device %s\n",
Harvey Harrison	8e24eea	2008-04-30 00:55:09 -0700	[diff] [blame]	1076	__func__, (unsigned long long)block,
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1077	bdevname(bdev, b));
				1078	return -EIO;
				1079	}
				1080	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1081	/* Create a page with the proper size buffers.. */
				1082	page = grow_dev_page(bdev, block, index, size);
				1083	if (!page)
				1084	return 0;
				1085	unlock_page(page);
				1086	page_cache_release(page);
				1087	return 1;
				1088	}
				1089
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1090	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1091	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1092	{
				1093	/* Size must be multiple of hard sectorsize */
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1094	if (unlikely(size & (bdev_logical_block_size(bdev)-1) \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1095	(size < 512 \|\| size > PAGE_SIZE))) {
				1096	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1097	size);
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1098	printk(KERN_ERR "logical block size: %d\n",
				1099	bdev_logical_block_size(bdev));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1100
				1101	dump_stack();
				1102	return NULL;
				1103	}
				1104
				1105	for (;;) {
				1106	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1107	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1108
				1109	bh = __find_get_block(bdev, block, size);
				1110	if (bh)
				1111	return bh;
				1112
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1113	ret = grow_buffers(bdev, block, size);
				1114	if (ret < 0)
				1115	return NULL;
				1116	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1117	free_more_memory();
				1118	}
				1119	}
				1120
				1121	/*
				1122	* The relationship between dirty buffers and dirty pages:
				1123	*
				1124	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1125	* the page is tagged dirty in its radix tree.
				1126	*
				1127	* At all times, the dirtiness of the buffers represents the dirtiness of
				1128	* subsections of the page. If the page has buffers, the page dirty bit is
				1129	* merely a hint about the true dirty state.
				1130	*
				1131	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1132	* (if the page has buffers).
				1133	*
				1134	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1135	* buffers are not.
				1136	*
				1137	* Also. When blockdev buffers are explicitly read with bread(), they
				1138	* individually become uptodate. But their backing page remains not
				1139	* uptodate - even if all of its buffers are uptodate. A subsequent
				1140	* block_read_full_page() against that page will discover all the uptodate
				1141	* buffers, will set the page uptodate and will perform no I/O.
				1142	*/
				1143
				1144	/**
				1145	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1146	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1147	*
				1148	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1149	* backing page dirty, then tag the page as dirty in its address_space's radix
				1150	* tree and then attach the address_space's inode to its superblock's dirty
				1151	* inode list.
				1152	*
				1153	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1154	* mapping->tree_lock and the global inode_lock.
				1155	*/
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	1156	void mark_buffer_dirty(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1157	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1158	WARN_ON_ONCE(!buffer_uptodate(bh));
Linus Torvalds	1be62dc	2008-04-04 14:38:17 -0700	[diff] [blame]	1159
				1160	/*
				1161	* Very carefully optimize the it-is-already-dirty case.
				1162	*
				1163	* Don't let the final "is it dirty" escape to before we
				1164	* perhaps modified the buffer.
				1165	*/
				1166	if (buffer_dirty(bh)) {
				1167	smp_mb();
				1168	if (buffer_dirty(bh))
				1169	return;
				1170	}
				1171
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1172	if (!test_set_buffer_dirty(bh)) {
				1173	struct page *page = bh->b_page;
Linus Torvalds	8e9d78e	2009-08-21 17:40:08 -0700	[diff] [blame]	1174	if (!TestSetPageDirty(page)) {
				1175	struct address_space *mapping = page_mapping(page);
				1176	if (mapping)
				1177	__set_page_dirty(page, mapping, 0);
				1178	}
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1179	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1180	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1181	EXPORT_SYMBOL(mark_buffer_dirty);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1182
				1183	/*
				1184	* Decrement a buffer_head's reference count. If all buffers against a page
				1185	* have zero reference count, are clean and unlocked, and if the page is clean
				1186	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1187	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1188	* a page but it ends up not being freed, and buffers may later be reattached).
				1189	*/
				1190	void __brelse(struct buffer_head * buf)
				1191	{
				1192	if (atomic_read(&buf->b_count)) {
				1193	put_bh(buf);
				1194	return;
				1195	}
Arjan van de Ven	5c752ad	2008-07-25 19:45:40 -0700	[diff] [blame]	1196	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1197	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1198	EXPORT_SYMBOL(__brelse);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1199
				1200	/*
				1201	* bforget() is like brelse(), except it discards any
				1202	* potentially dirty data.
				1203	*/
				1204	void __bforget(struct buffer_head *bh)
				1205	{
				1206	clear_buffer_dirty(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	1207	if (bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1208	struct address_space *buffer_mapping = bh->b_page->mapping;
				1209
				1210	spin_lock(&buffer_mapping->private_lock);
				1211	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1212	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1213	spin_unlock(&buffer_mapping->private_lock);
				1214	}
				1215	__brelse(bh);
				1216	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1217	EXPORT_SYMBOL(__bforget);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1218
				1219	static struct buffer_head __bread_slow(struct buffer_head bh)
				1220	{
				1221	lock_buffer(bh);
				1222	if (buffer_uptodate(bh)) {
				1223	unlock_buffer(bh);
				1224	return bh;
				1225	} else {
				1226	get_bh(bh);
				1227	bh->b_end_io = end_buffer_read_sync;
				1228	submit_bh(READ, bh);
				1229	wait_on_buffer(bh);
				1230	if (buffer_uptodate(bh))
				1231	return bh;
				1232	}
				1233	brelse(bh);
				1234	return NULL;
				1235	}
				1236
				1237	/*
				1238	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1239	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1240	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1241	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1242	* CPU's LRUs at the same time.
				1243	*
				1244	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1245	* sb_find_get_block().
				1246	*
				1247	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1248	* a local interrupt disable for that.
				1249	*/
				1250
				1251	#define BH_LRU_SIZE 8
				1252
				1253	struct bh_lru {
				1254	struct buffer_head *bhs[BH_LRU_SIZE];
				1255	};
				1256
				1257	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1258
				1259	#ifdef CONFIG_SMP
				1260	#define bh_lru_lock() local_irq_disable()
				1261	#define bh_lru_unlock() local_irq_enable()
				1262	#else
				1263	#define bh_lru_lock() preempt_disable()
				1264	#define bh_lru_unlock() preempt_enable()
				1265	#endif
				1266
				1267	static inline void check_irqs_on(void)
				1268	{
				1269	#ifdef irqs_disabled
				1270	BUG_ON(irqs_disabled());
				1271	#endif
				1272	}
				1273
				1274	/*
				1275	* The LRU management algorithm is dopey-but-simple. Sorry.
				1276	*/
				1277	static void bh_lru_install(struct buffer_head *bh)
				1278	{
				1279	struct buffer_head *evictee = NULL;
				1280	struct bh_lru *lru;
				1281
				1282	check_irqs_on();
				1283	bh_lru_lock();
				1284	lru = &__get_cpu_var(bh_lrus);
				1285	if (lru->bhs[0] != bh) {
				1286	struct buffer_head *bhs[BH_LRU_SIZE];
				1287	int in;
				1288	int out = 0;
				1289
				1290	get_bh(bh);
				1291	bhs[out++] = bh;
				1292	for (in = 0; in < BH_LRU_SIZE; in++) {
				1293	struct buffer_head *bh2 = lru->bhs[in];
				1294
				1295	if (bh2 == bh) {
				1296	__brelse(bh2);
				1297	} else {
				1298	if (out >= BH_LRU_SIZE) {
				1299	BUG_ON(evictee != NULL);
				1300	evictee = bh2;
				1301	} else {
				1302	bhs[out++] = bh2;
				1303	}
				1304	}
				1305	}
				1306	while (out < BH_LRU_SIZE)
				1307	bhs[out++] = NULL;
				1308	memcpy(lru->bhs, bhs, sizeof(bhs));
				1309	}
				1310	bh_lru_unlock();
				1311
				1312	if (evictee)
				1313	__brelse(evictee);
				1314	}
				1315
				1316	/*
				1317	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1318	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1319	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1320	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1321	{
				1322	struct buffer_head *ret = NULL;
				1323	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1324	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1325
				1326	check_irqs_on();
				1327	bh_lru_lock();
				1328	lru = &__get_cpu_var(bh_lrus);
				1329	for (i = 0; i < BH_LRU_SIZE; i++) {
				1330	struct buffer_head *bh = lru->bhs[i];
				1331
				1332	if (bh && bh->b_bdev == bdev &&
				1333	bh->b_blocknr == block && bh->b_size == size) {
				1334	if (i) {
				1335	while (i) {
				1336	lru->bhs[i] = lru->bhs[i - 1];
				1337	i--;
				1338	}
				1339	lru->bhs[0] = bh;
				1340	}
				1341	get_bh(bh);
				1342	ret = bh;
				1343	break;
				1344	}
				1345	}
				1346	bh_lru_unlock();
				1347	return ret;
				1348	}
				1349
				1350	/*
				1351	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1352	* it in the LRU and mark it as accessed. If it is not present then return
				1353	* NULL
				1354	*/
				1355	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1356	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1357	{
				1358	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1359
				1360	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1361	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1362	if (bh)
				1363	bh_lru_install(bh);
				1364	}
				1365	if (bh)
				1366	touch_buffer(bh);
				1367	return bh;
				1368	}
				1369	EXPORT_SYMBOL(__find_get_block);
				1370
				1371	/*
				1372	* __getblk will locate (and, if necessary, create) the buffer_head
				1373	* which corresponds to the passed block_device, block and size. The
				1374	* returned buffer has its reference count incremented.
				1375	*
				1376	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1377	* illegal block number, __getblk() will happily return a buffer_head
				1378	* which represents the non-existent block. Very weird.
				1379	*
				1380	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1381	* attempt is failing. FIXME, perhaps?
				1382	*/
				1383	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1384	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1385	{
				1386	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1387
				1388	might_sleep();
				1389	if (bh == NULL)
				1390	bh = __getblk_slow(bdev, block, size);
				1391	return bh;
				1392	}
				1393	EXPORT_SYMBOL(__getblk);
				1394
				1395	/*
				1396	* Do async read-ahead on a buffer..
				1397	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1398	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1399	{
				1400	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1401	if (likely(bh)) {
				1402	ll_rw_block(READA, 1, &bh);
				1403	brelse(bh);
				1404	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1405	}
				1406	EXPORT_SYMBOL(__breadahead);
				1407
				1408	/**
				1409	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1410	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1411	* @block: number of block
				1412	* @size: size (in bytes) to read
				1413	*
				1414	* Reads a specified block, and returns buffer head that contains it.
				1415	* It returns NULL if the block was unreadable.
				1416	*/
				1417	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1418	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1419	{
				1420	struct buffer_head *bh = __getblk(bdev, block, size);
				1421
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1422	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1423	bh = __bread_slow(bh);
				1424	return bh;
				1425	}
				1426	EXPORT_SYMBOL(__bread);
				1427
				1428	/*
				1429	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1430	* This doesn't race because it runs in each cpu either in irq
				1431	* or with preempt disabled.
				1432	*/
				1433	static void invalidate_bh_lru(void *arg)
				1434	{
				1435	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1436	int i;
				1437
				1438	for (i = 0; i < BH_LRU_SIZE; i++) {
				1439	brelse(b->bhs[i]);
				1440	b->bhs[i] = NULL;
				1441	}
				1442	put_cpu_var(bh_lrus);
				1443	}
				1444
Peter Zijlstra	f9a1439	2007-05-06 14:49:55 -0700	[diff] [blame]	1445	void invalidate_bh_lrus(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1446	{
Jens Axboe	15c8b6c	2008-05-09 09:39:44 +0200	[diff] [blame]	1447	on_each_cpu(invalidate_bh_lru, NULL, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1448	}
Nick Piggin	9db5579	2008-02-08 04:19:49 -0800	[diff] [blame]	1449	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1450
				1451	void set_bh_page(struct buffer_head *bh,
				1452	struct page *page, unsigned long offset)
				1453	{
				1454	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1455	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1456	if (PageHighMem(page))
				1457	/*
				1458	* This catches illegal uses and preserves the offset:
				1459	*/
				1460	bh->b_data = (char *)(0 + offset);
				1461	else
				1462	bh->b_data = page_address(page) + offset;
				1463	}
				1464	EXPORT_SYMBOL(set_bh_page);
				1465
				1466	/*
				1467	* Called when truncating a buffer on a page completely.
				1468	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1469	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1470	{
				1471	lock_buffer(bh);
				1472	clear_buffer_dirty(bh);
				1473	bh->b_bdev = NULL;
				1474	clear_buffer_mapped(bh);
				1475	clear_buffer_req(bh);
				1476	clear_buffer_new(bh);
				1477	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1478	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1479	unlock_buffer(bh);
				1480	}
				1481
				1482	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1483	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1484	*
				1485	* @page: the page which is affected
				1486	* @offset: the index of the truncation point
				1487	*
				1488	* block_invalidatepage() is called when all or part of the page has become
				1489	* invalidatedby a truncate operation.
				1490	*
				1491	* block_invalidatepage() does not have to release all buffers, but it must
				1492	* ensure that no dirty buffer is left outside @offset and that no I/O
				1493	* is underway against any of the blocks which are outside the truncation
				1494	* point. Because the caller is about to free (and possibly reuse) those
				1495	* blocks on-disk.
				1496	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1497	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1498	{
				1499	struct buffer_head head, bh, *next;
				1500	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1501
				1502	BUG_ON(!PageLocked(page));
				1503	if (!page_has_buffers(page))
				1504	goto out;
				1505
				1506	head = page_buffers(page);
				1507	bh = head;
				1508	do {
				1509	unsigned int next_off = curr_off + bh->b_size;
				1510	next = bh->b_this_page;
				1511
				1512	/*
				1513	* is this block fully invalidated?
				1514	*/
				1515	if (offset <= curr_off)
				1516	discard_buffer(bh);
				1517	curr_off = next_off;
				1518	bh = next;
				1519	} while (bh != head);
				1520
				1521	/*
				1522	* We release buffers only if the entire page is being invalidated.
				1523	* The get_block cached value has been unconditionally invalidated,
				1524	* so real IO is not possible anymore.
				1525	*/
				1526	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1527	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1528	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1529	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1530	}
				1531	EXPORT_SYMBOL(block_invalidatepage);
				1532
				1533	/*
				1534	* We attach and possibly dirty the buffers atomically wrt
				1535	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1536	* is already excluded via the page lock.
				1537	*/
				1538	void create_empty_buffers(struct page *page,
				1539	unsigned long blocksize, unsigned long b_state)
				1540	{
				1541	struct buffer_head bh, head, *tail;
				1542
				1543	head = alloc_page_buffers(page, blocksize, 1);
				1544	bh = head;
				1545	do {
				1546	bh->b_state \|= b_state;
				1547	tail = bh;
				1548	bh = bh->b_this_page;
				1549	} while (bh);
				1550	tail->b_this_page = head;
				1551
				1552	spin_lock(&page->mapping->private_lock);
				1553	if (PageUptodate(page) \|\| PageDirty(page)) {
				1554	bh = head;
				1555	do {
				1556	if (PageDirty(page))
				1557	set_buffer_dirty(bh);
				1558	if (PageUptodate(page))
				1559	set_buffer_uptodate(bh);
				1560	bh = bh->b_this_page;
				1561	} while (bh != head);
				1562	}
				1563	attach_page_buffers(page, head);
				1564	spin_unlock(&page->mapping->private_lock);
				1565	}
				1566	EXPORT_SYMBOL(create_empty_buffers);
				1567
				1568	/*
				1569	* We are taking a block for data and we don't want any output from any
				1570	* buffer-cache aliases starting from return from that function and
				1571	* until the moment when something will explicitly mark the buffer
				1572	* dirty (hopefully that will not happen until we will free that block ;-)
				1573	* We don't even need to mark it not-uptodate - nobody can expect
				1574	* anything from a newly allocated buffer anyway. We used to used
				1575	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1576	* don't want to mark the alias unmapped, for example - it would confuse
				1577	* anyone who might pick it with bread() afterwards...
				1578	*
				1579	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1580	* be writeout I/O going on against recently-freed buffers. We don't
				1581	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1582	* only if we really need to. That happens here.
				1583	*/
				1584	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1585	{
				1586	struct buffer_head *old_bh;
				1587
				1588	might_sleep();
				1589
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1590	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1591	if (old_bh) {
				1592	clear_buffer_dirty(old_bh);
				1593	wait_on_buffer(old_bh);
				1594	clear_buffer_req(old_bh);
				1595	__brelse(old_bh);
				1596	}
				1597	}
				1598	EXPORT_SYMBOL(unmap_underlying_metadata);
				1599
				1600	/*
				1601	* NOTE! All mapped/uptodate combinations are valid:
				1602	*
				1603	* Mapped Uptodate Meaning
				1604	*
				1605	* No No "unknown" - must do get_block()
				1606	* No Yes "hole" - zero-filled
				1607	* Yes No "allocated" - allocated on disk, not read in
				1608	* Yes Yes "valid" - allocated and up-to-date in memory.
				1609	*
				1610	* "Dirty" is valid only with the last case (mapped+uptodate).
				1611	*/
				1612
				1613	/*
				1614	* While block_write_full_page is writing back the dirty buffers under
				1615	* the page lock, whoever dirtied the buffers may decide to clean them
				1616	* again at any time. We handle that by only looking at the buffer
				1617	* state inside lock_buffer().
				1618	*
				1619	* If block_write_full_page() is called for regular writeback
				1620	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1621	* locked buffer. This only can happen if someone has written the buffer
				1622	* directly, with submit_bh(). At the address_space level PageWriteback
				1623	* prevents this contention from occurring.
Theodore Ts'o	6e34eed	2009-04-07 18:12:43 -0400	[diff] [blame]	1624	*
				1625	* If block_write_full_page() is called with wbc->sync_mode ==
				1626	* WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
				1627	* causes the writes to be flagged as synchronous writes, but the
				1628	* block device queue will NOT be unplugged, since usually many pages
				1629	* will be pushed to the out before the higher-level caller actually
				1630	* waits for the writes to be completed. The various wait functions,
				1631	* such as wait_on_writeback_range() will ultimately call sync_page()
				1632	* which will ultimately call blk_run_backing_dev(), which will end up
				1633	* unplugging the device queue.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1634	*/
				1635	static int __block_write_full_page(struct inode inode, struct page page,
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1636	get_block_t get_block, struct writeback_control wbc,
				1637	bh_end_io_t *handler)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1638	{
				1639	int err;
				1640	sector_t block;
				1641	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1642	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1643	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1644	int nr_underway = 0;
Theodore Ts'o	6e34eed	2009-04-07 18:12:43 -0400	[diff] [blame]	1645	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
				1646	WRITE_SYNC_PLUG : WRITE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1647
				1648	BUG_ON(!PageLocked(page));
				1649
				1650	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1651
				1652	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1653	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1654	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1655	}
				1656
				1657	/*
				1658	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1659	* here, and the (potentially unmapped) buffers may become dirty at
				1660	* any time. If a buffer becomes dirty here after we've inspected it
				1661	* then we just miss that fact, and the page stays dirty.
				1662	*
				1663	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1664	* handle that here by just cleaning them.
				1665	*/
				1666
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1667	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1668	head = page_buffers(page);
				1669	bh = head;
				1670
				1671	/*
				1672	* Get all the dirty buffers mapped to disk addresses and
				1673	* handle any aliases from the underlying blockdev's mapping.
				1674	*/
				1675	do {
				1676	if (block > last_block) {
				1677	/*
				1678	* mapped buffers outside i_size will occur, because
				1679	* this page can be outside i_size when there is a
				1680	* truncate in progress.
				1681	*/
				1682	/*
				1683	* The buffer was zeroed by block_write_full_page()
				1684	*/
				1685	clear_buffer_dirty(bh);
				1686	set_buffer_uptodate(bh);
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1687	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
				1688	buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1689	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1690	err = get_block(inode, block, bh, 1);
				1691	if (err)
				1692	goto recover;
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1693	clear_buffer_delay(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1694	if (buffer_new(bh)) {
				1695	/* blockdev mappings never come here */
				1696	clear_buffer_new(bh);
				1697	unmap_underlying_metadata(bh->b_bdev,
				1698	bh->b_blocknr);
				1699	}
				1700	}
				1701	bh = bh->b_this_page;
				1702	block++;
				1703	} while (bh != head);
				1704
				1705	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1706	if (!buffer_mapped(bh))
				1707	continue;
				1708	/*
				1709	* If it's a fully non-blocking write attempt and we cannot
				1710	* lock the buffer then redirty the page. Note that this can
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	1711	* potentially cause a busy-wait loop from writeback threads
				1712	* and kswapd activity, but those code paths have their own
				1713	* higher-level throttling.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1714	*/
				1715	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1716	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	1717	} else if (!trylock_buffer(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1718	redirty_page_for_writepage(wbc, page);
				1719	continue;
				1720	}
				1721	if (test_clear_buffer_dirty(bh)) {
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1722	mark_buffer_async_write_endio(bh, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1723	} else {
				1724	unlock_buffer(bh);
				1725	}
				1726	} while ((bh = bh->b_this_page) != head);
				1727
				1728	/*
				1729	* The page and its buffers are protected by PageWriteback(), so we can
				1730	* drop the bh refcounts early.
				1731	*/
				1732	BUG_ON(PageWriteback(page));
				1733	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1734
				1735	do {
				1736	struct buffer_head *next = bh->b_this_page;
				1737	if (buffer_async_write(bh)) {
Theodore Ts'o	a64c861	2009-03-27 22:14:10 -0400	[diff] [blame]	1738	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1739	nr_underway++;
				1740	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1741	bh = next;
				1742	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1743	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1744
				1745	err = 0;
				1746	done:
				1747	if (nr_underway == 0) {
				1748	/*
				1749	* The page was marked dirty, but the buffers were
				1750	* clean. Someone wrote them back by hand with
				1751	* ll_rw_block/submit_bh. A rare case.
				1752	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1753	end_page_writeback(page);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1754
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1755	/*
				1756	* The page and buffer_heads can be released at any time from
				1757	* here on.
				1758	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1759	}
				1760	return err;
				1761
				1762	recover:
				1763	/*
				1764	* ENOSPC, or some other error. We may already have added some
				1765	* blocks to the file, so we need to write these out to avoid
				1766	* exposing stale data.
				1767	* The page is currently locked and not marked for writeback
				1768	*/
				1769	bh = head;
				1770	/* Recovery: lock and submit the mapped buffers */
				1771	do {
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1772	if (buffer_mapped(bh) && buffer_dirty(bh) &&
				1773	!buffer_delay(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1774	lock_buffer(bh);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1775	mark_buffer_async_write_endio(bh, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1776	} else {
				1777	/*
				1778	* The buffer may have been set dirty during
				1779	* attachment to a dirty page.
				1780	*/
				1781	clear_buffer_dirty(bh);
				1782	}
				1783	} while ((bh = bh->b_this_page) != head);
				1784	SetPageError(page);
				1785	BUG_ON(PageWriteback(page));
Andrew Morton	7e4c369	2007-05-08 00:23:27 -0700	[diff] [blame]	1786	mapping_set_error(page->mapping, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1787	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1788	do {
				1789	struct buffer_head *next = bh->b_this_page;
				1790	if (buffer_async_write(bh)) {
				1791	clear_buffer_dirty(bh);
Theodore Ts'o	a64c861	2009-03-27 22:14:10 -0400	[diff] [blame]	1792	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1793	nr_underway++;
				1794	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	bh = next;
				1796	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1797	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1798	goto done;
				1799	}
				1800
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1801	/*
				1802	* If a page has any new buffers, zero them out here, and mark them uptodate
				1803	* and dirty so they'll be written out (in order to prevent uninitialised
				1804	* block data from leaking). And clear the new bit.
				1805	*/
				1806	void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
				1807	{
				1808	unsigned int block_start, block_end;
				1809	struct buffer_head head, bh;
				1810
				1811	BUG_ON(!PageLocked(page));
				1812	if (!page_has_buffers(page))
				1813	return;
				1814
				1815	bh = head = page_buffers(page);
				1816	block_start = 0;
				1817	do {
				1818	block_end = block_start + bh->b_size;
				1819
				1820	if (buffer_new(bh)) {
				1821	if (block_end > from && block_start < to) {
				1822	if (!PageUptodate(page)) {
				1823	unsigned start, size;
				1824
				1825	start = max(from, block_start);
				1826	size = min(to, block_end) - start;
				1827
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1828	zero_user(page, start, size);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1829	set_buffer_uptodate(bh);
				1830	}
				1831
				1832	clear_buffer_new(bh);
				1833	mark_buffer_dirty(bh);
				1834	}
				1835	}
				1836
				1837	block_start = block_end;
				1838	bh = bh->b_this_page;
				1839	} while (bh != head);
				1840	}
				1841	EXPORT_SYMBOL(page_zero_new_buffers);
				1842
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1843	static int __block_prepare_write(struct inode inode, struct page page,
				1844	unsigned from, unsigned to, get_block_t *get_block)
				1845	{
				1846	unsigned block_start, block_end;
				1847	sector_t block;
				1848	int err = 0;
				1849	unsigned blocksize, bbits;
				1850	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1851
				1852	BUG_ON(!PageLocked(page));
				1853	BUG_ON(from > PAGE_CACHE_SIZE);
				1854	BUG_ON(to > PAGE_CACHE_SIZE);
				1855	BUG_ON(from > to);
				1856
				1857	blocksize = 1 << inode->i_blkbits;
				1858	if (!page_has_buffers(page))
				1859	create_empty_buffers(page, blocksize, 0);
				1860	head = page_buffers(page);
				1861
				1862	bbits = inode->i_blkbits;
				1863	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1864
				1865	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1866	block++, block_start=block_end, bh = bh->b_this_page) {
				1867	block_end = block_start + blocksize;
				1868	if (block_end <= from \|\| block_start >= to) {
				1869	if (PageUptodate(page)) {
				1870	if (!buffer_uptodate(bh))
				1871	set_buffer_uptodate(bh);
				1872	}
				1873	continue;
				1874	}
				1875	if (buffer_new(bh))
				1876	clear_buffer_new(bh);
				1877	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1878	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1879	err = get_block(inode, block, bh, 1);
				1880	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1881	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1882	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1883	unmap_underlying_metadata(bh->b_bdev,
				1884	bh->b_blocknr);
				1885	if (PageUptodate(page)) {
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1886	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1887	set_buffer_uptodate(bh);
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1888	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1889	continue;
				1890	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1891	if (block_end > to \|\| block_start < from)
				1892	zero_user_segments(page,
				1893	to, block_end,
				1894	block_start, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1895	continue;
				1896	}
				1897	}
				1898	if (PageUptodate(page)) {
				1899	if (!buffer_uptodate(bh))
				1900	set_buffer_uptodate(bh);
				1901	continue;
				1902	}
				1903	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1904	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1905	(block_start < from \|\| block_end > to)) {
				1906	ll_rw_block(READ, 1, &bh);
				1907	*wait_bh++=bh;
				1908	}
				1909	}
				1910	/*
				1911	* If we issued read requests - let them complete.
				1912	*/
				1913	while(wait_bh > wait) {
				1914	wait_on_buffer(*--wait_bh);
				1915	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1916	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1917	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1918	if (unlikely(err))
				1919	page_zero_new_buffers(page, from, to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1920	return err;
				1921	}
				1922
				1923	static int __block_commit_write(struct inode inode, struct page page,
				1924	unsigned from, unsigned to)
				1925	{
				1926	unsigned block_start, block_end;
				1927	int partial = 0;
				1928	unsigned blocksize;
				1929	struct buffer_head bh, head;
				1930
				1931	blocksize = 1 << inode->i_blkbits;
				1932
				1933	for(bh = head = page_buffers(page), block_start = 0;
				1934	bh != head \|\| !block_start;
				1935	block_start=block_end, bh = bh->b_this_page) {
				1936	block_end = block_start + blocksize;
				1937	if (block_end <= from \|\| block_start >= to) {
				1938	if (!buffer_uptodate(bh))
				1939	partial = 1;
				1940	} else {
				1941	set_buffer_uptodate(bh);
				1942	mark_buffer_dirty(bh);
				1943	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1944	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1945	}
				1946
				1947	/*
				1948	* If this is a partial write which happened to make all buffers
				1949	* uptodate then we can optimize away a bogus readpage() for
				1950	* the next read(). Here we 'discover' whether the page went
				1951	* uptodate as a result of this (potentially partial) write.
				1952	*/
				1953	if (!partial)
				1954	SetPageUptodate(page);
				1955	return 0;
				1956	}
				1957
				1958	/*
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1959	* block_write_begin takes care of the basic task of block allocation and
				1960	* bringing partial write blocks uptodate first.
				1961	*
				1962	* If *pagep is not NULL, then block_write_begin uses the locked page
				1963	* at *pagep rather than allocating its own. In this case, the page will
				1964	* not be unlocked or deallocated on failure.
				1965	*/
				1966	int block_write_begin(struct file file, struct address_space mapping,
				1967	loff_t pos, unsigned len, unsigned flags,
				1968	struct page pagep, void fsdata,
				1969	get_block_t *get_block)
				1970	{
				1971	struct inode *inode = mapping->host;
				1972	int status = 0;
				1973	struct page *page;
				1974	pgoff_t index;
				1975	unsigned start, end;
				1976	int ownpage = 0;
				1977
				1978	index = pos >> PAGE_CACHE_SHIFT;
				1979	start = pos & (PAGE_CACHE_SIZE - 1);
				1980	end = start + len;
				1981
				1982	page = *pagep;
				1983	if (page == NULL) {
				1984	ownpage = 1;
Nick Piggin	54566b2	2009-01-04 12:00:53 -0800	[diff] [blame]	1985	page = grab_cache_page_write_begin(mapping, index, flags);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1986	if (!page) {
				1987	status = -ENOMEM;
				1988	goto out;
				1989	}
				1990	*pagep = page;
				1991	} else
				1992	BUG_ON(!PageLocked(page));
				1993
				1994	status = __block_prepare_write(inode, page, start, end, get_block);
				1995	if (unlikely(status)) {
				1996	ClearPageUptodate(page);
				1997
				1998	if (ownpage) {
				1999	unlock_page(page);
				2000	page_cache_release(page);
				2001	*pagep = NULL;
				2002
				2003	/*
				2004	* prepare_write() may have instantiated a few blocks
				2005	* outside i_size. Trim these off again. Don't need
				2006	* i_size_read because we hold i_mutex.
				2007	*/
				2008	if (pos + len > inode->i_size)
				2009	vmtruncate(inode, inode->i_size);
				2010	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2011	}
				2012
				2013	out:
				2014	return status;
				2015	}
				2016	EXPORT_SYMBOL(block_write_begin);
				2017
				2018	int block_write_end(struct file file, struct address_space mapping,
				2019	loff_t pos, unsigned len, unsigned copied,
				2020	struct page page, void fsdata)
				2021	{
				2022	struct inode *inode = mapping->host;
				2023	unsigned start;
				2024
				2025	start = pos & (PAGE_CACHE_SIZE - 1);
				2026
				2027	if (unlikely(copied < len)) {
				2028	/*
				2029	* The buffers that were written will now be uptodate, so we
				2030	* don't have to worry about a readpage reading them and
				2031	* overwriting a partial write. However if we have encountered
				2032	* a short write and only partially written into a buffer, it
				2033	* will not be marked uptodate, so a readpage might come in and
				2034	* destroy our partial write.
				2035	*
				2036	* Do the simplest thing, and just treat any short write to a
				2037	* non uptodate page as a zero-length write, and force the
				2038	* caller to redo the whole thing.
				2039	*/
				2040	if (!PageUptodate(page))
				2041	copied = 0;
				2042
				2043	page_zero_new_buffers(page, start+copied, start+len);
				2044	}
				2045	flush_dcache_page(page);
				2046
				2047	/* This could be a short (even 0-length) commit */
				2048	__block_commit_write(inode, page, start, start+copied);
				2049
				2050	return copied;
				2051	}
				2052	EXPORT_SYMBOL(block_write_end);
				2053
				2054	int generic_write_end(struct file file, struct address_space mapping,
				2055	loff_t pos, unsigned len, unsigned copied,
				2056	struct page page, void fsdata)
				2057	{
				2058	struct inode *inode = mapping->host;
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2059	int i_size_changed = 0;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2060
				2061	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
				2062
				2063	/*
				2064	* No need to use i_size_read() here, the i_size
				2065	* cannot change under us because we hold i_mutex.
				2066	*
				2067	* But it's important to update i_size while still holding page lock:
				2068	* page writeout could otherwise come in and zero beyond i_size.
				2069	*/
				2070	if (pos+copied > inode->i_size) {
				2071	i_size_write(inode, pos+copied);
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2072	i_size_changed = 1;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2073	}
				2074
				2075	unlock_page(page);
				2076	page_cache_release(page);
				2077
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2078	/*
				2079	* Don't mark the inode dirty under page lock. First, it unnecessarily
				2080	* makes the holding time of page lock longer. Second, it forces lock
				2081	* ordering of page lock and transaction start for journaling
				2082	* filesystems.
				2083	*/
				2084	if (i_size_changed)
				2085	mark_inode_dirty(inode);
				2086
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2087	return copied;
				2088	}
				2089	EXPORT_SYMBOL(generic_write_end);
				2090
				2091	/*
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2092	* block_is_partially_uptodate checks whether buffers within a page are
				2093	* uptodate or not.
				2094	*
				2095	* Returns true if all buffers which correspond to a file portion
				2096	* we want to read are uptodate.
				2097	*/
				2098	int block_is_partially_uptodate(struct page page, read_descriptor_t desc,
				2099	unsigned long from)
				2100	{
				2101	struct inode *inode = page->mapping->host;
				2102	unsigned block_start, block_end, blocksize;
				2103	unsigned to;
				2104	struct buffer_head bh, head;
				2105	int ret = 1;
				2106
				2107	if (!page_has_buffers(page))
				2108	return 0;
				2109
				2110	blocksize = 1 << inode->i_blkbits;
				2111	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
				2112	to = from + to;
				2113	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
				2114	return 0;
				2115
				2116	head = page_buffers(page);
				2117	bh = head;
				2118	block_start = 0;
				2119	do {
				2120	block_end = block_start + blocksize;
				2121	if (block_end > from && block_start < to) {
				2122	if (!buffer_uptodate(bh)) {
				2123	ret = 0;
				2124	break;
				2125	}
				2126	if (block_end >= to)
				2127	break;
				2128	}
				2129	block_start = block_end;
				2130	bh = bh->b_this_page;
				2131	} while (bh != head);
				2132
				2133	return ret;
				2134	}
				2135	EXPORT_SYMBOL(block_is_partially_uptodate);
				2136
				2137	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2138	* Generic "read page" function for block devices that have the normal
				2139	* get_block functionality. This is most of the block device filesystems.
				2140	* Reads the page asynchronously --- the unlock_buffer() and
				2141	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2142	* page struct once IO has completed.
				2143	*/
				2144	int block_read_full_page(struct page page, get_block_t get_block)
				2145	{
				2146	struct inode *inode = page->mapping->host;
				2147	sector_t iblock, lblock;
				2148	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2149	unsigned int blocksize;
				2150	int nr, i;
				2151	int fully_mapped = 1;
				2152
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2153	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2154	blocksize = 1 << inode->i_blkbits;
				2155	if (!page_has_buffers(page))
				2156	create_empty_buffers(page, blocksize, 0);
				2157	head = page_buffers(page);
				2158
				2159	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2160	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2161	bh = head;
				2162	nr = 0;
				2163	i = 0;
				2164
				2165	do {
				2166	if (buffer_uptodate(bh))
				2167	continue;
				2168
				2169	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2170	int err = 0;
				2171
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2172	fully_mapped = 0;
				2173	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2174	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2175	err = get_block(inode, iblock, bh, 0);
				2176	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2177	SetPageError(page);
				2178	}
				2179	if (!buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2180	zero_user(page, i * blocksize, blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2181	if (!err)
				2182	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2183	continue;
				2184	}
				2185	/*
				2186	* get_block() might have updated the buffer
				2187	* synchronously
				2188	*/
				2189	if (buffer_uptodate(bh))
				2190	continue;
				2191	}
				2192	arr[nr++] = bh;
				2193	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2194
				2195	if (fully_mapped)
				2196	SetPageMappedToDisk(page);
				2197
				2198	if (!nr) {
				2199	/*
				2200	* All buffers are uptodate - we can set the page uptodate
				2201	* as well. But not if get_block() returned an error.
				2202	*/
				2203	if (!PageError(page))
				2204	SetPageUptodate(page);
				2205	unlock_page(page);
				2206	return 0;
				2207	}
				2208
				2209	/* Stage two: lock the buffers */
				2210	for (i = 0; i < nr; i++) {
				2211	bh = arr[i];
				2212	lock_buffer(bh);
				2213	mark_buffer_async_read(bh);
				2214	}
				2215
				2216	/*
				2217	* Stage 3: start the IO. Check for uptodateness
				2218	* inside the buffer lock in case another process reading
				2219	* the underlying blockdev brought it uptodate (the sct fix).
				2220	*/
				2221	for (i = 0; i < nr; i++) {
				2222	bh = arr[i];
				2223	if (buffer_uptodate(bh))
				2224	end_buffer_async_read(bh, 1);
				2225	else
				2226	submit_bh(READ, bh);
				2227	}
				2228	return 0;
				2229	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2230	EXPORT_SYMBOL(block_read_full_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2231
				2232	/* utility function for filesystems that need to do work on expanding
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2233	* truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2234	* deal with the hole.
				2235	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2236	int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2237	{
				2238	struct address_space *mapping = inode->i_mapping;
				2239	struct page *page;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2240	void *fsdata;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2241	int err;
				2242
npiggin@suse.de	c08d3b0	2009-08-21 02:35:06 +1000	[diff] [blame]	2243	err = inode_newsize_ok(inode, size);
				2244	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2245	goto out;
				2246
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2247	err = pagecache_write_begin(NULL, mapping, size, 0,
				2248	AOP_FLAG_UNINTERRUPTIBLE\|AOP_FLAG_CONT_EXPAND,
				2249	&page, &fsdata);
				2250	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2251	goto out;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2252
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2253	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
				2254	BUG_ON(err > 0);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2255
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2256	out:
				2257	return err;
				2258	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2259	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2260
Adrian Bunk	f1e3af7	2008-04-29 00:59:01 -0700	[diff] [blame]	2261	static int cont_expand_zero(struct file file, struct address_space mapping,
				2262	loff_t pos, loff_t *bytes)
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2263	{
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2264	struct inode *inode = mapping->host;
				2265	unsigned blocksize = 1 << inode->i_blkbits;
				2266	struct page *page;
				2267	void *fsdata;
				2268	pgoff_t index, curidx;
				2269	loff_t curpos;
				2270	unsigned zerofrom, offset, len;
				2271	int err = 0;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2272
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2273	index = pos >> PAGE_CACHE_SHIFT;
				2274	offset = pos & ~PAGE_CACHE_MASK;
				2275
				2276	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
				2277	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2278	if (zerofrom & (blocksize-1)) {
				2279	*bytes \|= (blocksize-1);
				2280	(*bytes)++;
				2281	}
				2282	len = PAGE_CACHE_SIZE - zerofrom;
				2283
				2284	err = pagecache_write_begin(file, mapping, curpos, len,
				2285	AOP_FLAG_UNINTERRUPTIBLE,
				2286	&page, &fsdata);
				2287	if (err)
				2288	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2289	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2290	err = pagecache_write_end(file, mapping, curpos, len, len,
				2291	page, fsdata);
				2292	if (err < 0)
				2293	goto out;
				2294	BUG_ON(err != len);
				2295	err = 0;
OGAWA Hirofumi	061e974	2008-04-28 02:16:28 -0700	[diff] [blame]	2296
				2297	balance_dirty_pages_ratelimited(mapping);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2298	}
				2299
				2300	/* page covers the boundary, find the boundary offset */
				2301	if (index == curidx) {
				2302	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2303	/* if we will expand the thing last block will be filled */
				2304	if (offset <= zerofrom) {
				2305	goto out;
				2306	}
				2307	if (zerofrom & (blocksize-1)) {
				2308	*bytes \|= (blocksize-1);
				2309	(*bytes)++;
				2310	}
				2311	len = offset - zerofrom;
				2312
				2313	err = pagecache_write_begin(file, mapping, curpos, len,
				2314	AOP_FLAG_UNINTERRUPTIBLE,
				2315	&page, &fsdata);
				2316	if (err)
				2317	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2318	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2319	err = pagecache_write_end(file, mapping, curpos, len, len,
				2320	page, fsdata);
				2321	if (err < 0)
				2322	goto out;
				2323	BUG_ON(err != len);
				2324	err = 0;
				2325	}
				2326	out:
				2327	return err;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2328	}
				2329
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2330	/*
				2331	* For moronic filesystems that do not allow holes in file.
				2332	* We may have to extend the file.
				2333	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2334	int cont_write_begin(struct file file, struct address_space mapping,
				2335	loff_t pos, unsigned len, unsigned flags,
				2336	struct page pagep, void fsdata,
				2337	get_block_t get_block, loff_t bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2338	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2339	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2340	unsigned blocksize = 1 << inode->i_blkbits;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2341	unsigned zerofrom;
				2342	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2343
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2344	err = cont_expand_zero(file, mapping, pos, bytes);
				2345	if (err)
				2346	goto out;
				2347
				2348	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2349	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
				2350	*bytes \|= (blocksize-1);
				2351	(*bytes)++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2352	}
				2353
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2354	*pagep = NULL;
				2355	err = block_write_begin(file, mapping, pos, len,
				2356	flags, pagep, fsdata, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2357	out:
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2358	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2359	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2360	EXPORT_SYMBOL(cont_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2361
				2362	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2363	get_block_t *get_block)
				2364	{
				2365	struct inode *inode = page->mapping->host;
				2366	int err = __block_prepare_write(inode, page, from, to, get_block);
				2367	if (err)
				2368	ClearPageUptodate(page);
				2369	return err;
				2370	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2371	EXPORT_SYMBOL(block_prepare_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2372
				2373	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2374	{
				2375	struct inode *inode = page->mapping->host;
				2376	__block_commit_write(inode,page,from,to);
				2377	return 0;
				2378	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2379	EXPORT_SYMBOL(block_commit_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2380
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2381	/*
				2382	* block_page_mkwrite() is not allowed to change the file size as it gets
				2383	* called from a page fault handler when a page is first dirtied. Hence we must
				2384	* be careful to check for EOF conditions here. We set the page up correctly
				2385	* for a written page which means we get ENOSPC checking when writing into
				2386	* holes and correct delalloc and unwritten extent mapping on filesystems that
				2387	* support these features.
				2388	*
				2389	* We are not allowed to take the i_mutex here so we have to play games to
				2390	* protect against truncate races as the page could now be beyond EOF. Because
				2391	* vmtruncate() writes the inode size before removing pages, once we have the
				2392	* page lock we can determine safely if the page is beyond EOF. If it is not
				2393	* beyond EOF, then the page is guaranteed safe against truncation until we
				2394	* unlock the page.
				2395	*/
				2396	int
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2397	block_page_mkwrite(struct vm_area_struct vma, struct vm_fault vmf,
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2398	get_block_t get_block)
				2399	{
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2400	struct page *page = vmf->page;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2401	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
				2402	unsigned long end;
				2403	loff_t size;
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2404	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2405
				2406	lock_page(page);
				2407	size = i_size_read(inode);
				2408	if ((page->mapping != inode->i_mapping) \|\|
Nick Piggin	1833633	2007-07-20 00:31:45 -0700	[diff] [blame]	2409	(page_offset(page) > size)) {
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2410	/* page got truncated out from underneath us */
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2411	unlock_page(page);
				2412	goto out;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2413	}
				2414
				2415	/* page is wholly or partially inside EOF */
				2416	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
				2417	end = size & ~PAGE_CACHE_MASK;
				2418	else
				2419	end = PAGE_CACHE_SIZE;
				2420
				2421	ret = block_prepare_write(page, 0, end, get_block);
				2422	if (!ret)
				2423	ret = block_commit_write(page, 0, end);
				2424
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2425	if (unlikely(ret)) {
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2426	unlock_page(page);
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2427	if (ret == -ENOMEM)
				2428	ret = VM_FAULT_OOM;
				2429	else /* -ENOSPC, -EIO, etc */
				2430	ret = VM_FAULT_SIGBUS;
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2431	} else
				2432	ret = VM_FAULT_LOCKED;
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2433
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2434	out:
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2435	return ret;
				2436	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2437	EXPORT_SYMBOL(block_page_mkwrite);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2438
				2439	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2440	* nobh_write_begin()'s prereads are special: the buffer_heads are freed
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2441	* immediately, while under the page lock. So it needs a special end_io
				2442	* handler which does not touch the bh after unlocking it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2443	*/
				2444	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2445	{
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	2446	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2447	}
				2448
				2449	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2450	* Attach the singly-linked list of buffers created by nobh_write_begin, to
				2451	* the page (converting it to circular linked list and taking care of page
				2452	* dirty races).
				2453	*/
				2454	static void attach_nobh_buffers(struct page page, struct buffer_head head)
				2455	{
				2456	struct buffer_head *bh;
				2457
				2458	BUG_ON(!PageLocked(page));
				2459
				2460	spin_lock(&page->mapping->private_lock);
				2461	bh = head;
				2462	do {
				2463	if (PageDirty(page))
				2464	set_buffer_dirty(bh);
				2465	if (!bh->b_this_page)
				2466	bh->b_this_page = head;
				2467	bh = bh->b_this_page;
				2468	} while (bh != head);
				2469	attach_page_buffers(page, head);
				2470	spin_unlock(&page->mapping->private_lock);
				2471	}
				2472
				2473	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2474	* On entry, the page is fully not uptodate.
				2475	* On exit the page is fully uptodate in the areas outside (from,to)
				2476	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2477	int nobh_write_begin(struct file file, struct address_space mapping,
				2478	loff_t pos, unsigned len, unsigned flags,
				2479	struct page pagep, void fsdata,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2480	get_block_t *get_block)
				2481	{
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2482	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2483	const unsigned blkbits = inode->i_blkbits;
				2484	const unsigned blocksize = 1 << blkbits;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2485	struct buffer_head head, bh;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2486	struct page *page;
				2487	pgoff_t index;
				2488	unsigned from, to;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2489	unsigned block_in_page;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2490	unsigned block_start, block_end;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2491	sector_t block_in_file;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2492	int nr_reads = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2493	int ret = 0;
				2494	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2495
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2496	index = pos >> PAGE_CACHE_SHIFT;
				2497	from = pos & (PAGE_CACHE_SIZE - 1);
				2498	to = from + len;
				2499
Nick Piggin	54566b2	2009-01-04 12:00:53 -0800	[diff] [blame]	2500	page = grab_cache_page_write_begin(mapping, index, flags);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2501	if (!page)
				2502	return -ENOMEM;
				2503	*pagep = page;
				2504	*fsdata = NULL;
				2505
				2506	if (page_has_buffers(page)) {
				2507	unlock_page(page);
				2508	page_cache_release(page);
				2509	*pagep = NULL;
				2510	return block_write_begin(file, mapping, pos, len, flags, pagep,
				2511	fsdata, get_block);
				2512	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2513
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2514	if (PageMappedToDisk(page))
				2515	return 0;
				2516
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2517	/*
				2518	* Allocate buffers so that we can keep track of state, and potentially
				2519	* attach them to the page if an error occurs. In the common case of
				2520	* no error, they will just be freed again without ever being attached
				2521	* to the page (which is all OK, because we're under the page lock).
				2522	*
				2523	* Be careful: the buffer linked list is a NULL terminated one, rather
				2524	* than the circular one we're used to.
				2525	*/
				2526	head = alloc_page_buffers(page, blocksize, 0);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2527	if (!head) {
				2528	ret = -ENOMEM;
				2529	goto out_release;
				2530	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2531
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2532	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2533
				2534	/*
				2535	* We loop across all blocks in the page, whether or not they are
				2536	* part of the affected region. This is so we can discover if the
				2537	* page is fully mapped-to-disk.
				2538	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2539	for (block_start = 0, block_in_page = 0, bh = head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2540	block_start < PAGE_CACHE_SIZE;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2541	block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2542	int create;
				2543
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2544	block_end = block_start + blocksize;
				2545	bh->b_state = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2546	create = 1;
				2547	if (block_start >= to)
				2548	create = 0;
				2549	ret = get_block(inode, block_in_file + block_in_page,
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2550	bh, create);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2551	if (ret)
				2552	goto failed;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2553	if (!buffer_mapped(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2554	is_mapped_to_disk = 0;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2555	if (buffer_new(bh))
				2556	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				2557	if (PageUptodate(page)) {
				2558	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2559	continue;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2560	}
				2561	if (buffer_new(bh) \|\| !buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2562	zero_user_segments(page, block_start, from,
				2563	to, block_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2564	continue;
				2565	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2566	if (buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2567	continue; /* reiserfs does this */
				2568	if (block_start < from \|\| block_end > to) {
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2569	lock_buffer(bh);
				2570	bh->b_end_io = end_buffer_read_nobh;
				2571	submit_bh(READ, bh);
				2572	nr_reads++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2573	}
				2574	}
				2575
				2576	if (nr_reads) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2577	/*
				2578	* The page is locked, so these buffers are protected from
				2579	* any VM or truncate activity. Hence we don't need to care
				2580	* for the buffer_head refcounts.
				2581	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2582	for (bh = head; bh; bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2583	wait_on_buffer(bh);
				2584	if (!buffer_uptodate(bh))
				2585	ret = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2586	}
				2587	if (ret)
				2588	goto failed;
				2589	}
				2590
				2591	if (is_mapped_to_disk)
				2592	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2593
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2594	fsdata = head; / to be released by nobh_write_end */
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2595
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2596	return 0;
				2597
				2598	failed:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2599	BUG_ON(!ret);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2600	/*
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2601	* Error recovery is a bit difficult. We need to zero out blocks that
				2602	* were newly allocated, and dirty them to ensure they get written out.
				2603	* Buffers need to be attached to the page at this point, otherwise
				2604	* the handling of potential IO errors during writeout would be hard
				2605	* (could try doing synchronous writeout, but what if that fails too?)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2606	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2607	attach_nobh_buffers(page, head);
				2608	page_zero_new_buffers(page, from, to);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2609
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2610	out_release:
				2611	unlock_page(page);
				2612	page_cache_release(page);
				2613	*pagep = NULL;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2614
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2615	if (pos + len > inode->i_size)
				2616	vmtruncate(inode, inode->i_size);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2617
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2618	return ret;
				2619	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2620	EXPORT_SYMBOL(nobh_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2621
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2622	int nobh_write_end(struct file file, struct address_space mapping,
				2623	loff_t pos, unsigned len, unsigned copied,
				2624	struct page page, void fsdata)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2625	{
				2626	struct inode *inode = page->mapping->host;
Nick Piggin	efdc313	2007-10-21 06:57:41 +0200	[diff] [blame]	2627	struct buffer_head *head = fsdata;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2628	struct buffer_head *bh;
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2629	BUG_ON(fsdata != NULL && page_has_buffers(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2630
Dave Kleikamp	d4cf109	2009-02-06 14:59:26 -0600	[diff] [blame]	2631	if (unlikely(copied < len) && head)
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2632	attach_nobh_buffers(page, head);
				2633	if (page_has_buffers(page))
				2634	return generic_write_end(file, mapping, pos, len,
				2635	copied, page, fsdata);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2636
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2637	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2638	set_page_dirty(page);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2639	if (pos+copied > inode->i_size) {
				2640	i_size_write(inode, pos+copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2641	mark_inode_dirty(inode);
				2642	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2643
				2644	unlock_page(page);
				2645	page_cache_release(page);
				2646
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2647	while (head) {
				2648	bh = head;
				2649	head = head->b_this_page;
				2650	free_buffer_head(bh);
				2651	}
				2652
				2653	return copied;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2654	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2655	EXPORT_SYMBOL(nobh_write_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2656
				2657	/*
				2658	* nobh_writepage() - based on block_full_write_page() except
				2659	* that it tries to operate without attaching bufferheads to
				2660	* the page.
				2661	*/
				2662	int nobh_writepage(struct page page, get_block_t get_block,
				2663	struct writeback_control *wbc)
				2664	{
				2665	struct inode * const inode = page->mapping->host;
				2666	loff_t i_size = i_size_read(inode);
				2667	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2668	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2669	int ret;
				2670
				2671	/* Is the page fully inside i_size? */
				2672	if (page->index < end_index)
				2673	goto out;
				2674
				2675	/* Is the page fully outside i_size? (truncate in progress) */
				2676	offset = i_size & (PAGE_CACHE_SIZE-1);
				2677	if (page->index >= end_index+1 \|\| !offset) {
				2678	/*
				2679	* The page may have dirty, unmapped buffers. For example,
				2680	* they may have been added in ext3_writepage(). Make them
				2681	* freeable here, so the page does not leak.
				2682	*/
				2683	#if 0
				2684	/* Not really sure about this - do we need this ? */
				2685	if (page->mapping->a_ops->invalidatepage)
				2686	page->mapping->a_ops->invalidatepage(page, offset);
				2687	#endif
				2688	unlock_page(page);
				2689	return 0; /* don't care */
				2690	}
				2691
				2692	/*
				2693	* The page straddles i_size. It must be zeroed out on each and every
				2694	* writepage invocation because it may be mmapped. "A file is mapped
				2695	* in multiples of the page size. For a file that is not a multiple of
				2696	* the page size, the remaining memory is zeroed when mapped, and
				2697	* writes to that region are not written out to the file."
				2698	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2699	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2700	out:
				2701	ret = mpage_writepage(page, get_block, wbc);
				2702	if (ret == -EAGAIN)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2703	ret = __block_write_full_page(inode, page, get_block, wbc,
				2704	end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2705	return ret;
				2706	}
				2707	EXPORT_SYMBOL(nobh_writepage);
				2708
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2709	int nobh_truncate_page(struct address_space *mapping,
				2710	loff_t from, get_block_t *get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2711	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2712	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2713	unsigned offset = from & (PAGE_CACHE_SIZE-1);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2714	unsigned blocksize;
				2715	sector_t iblock;
				2716	unsigned length, pos;
				2717	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2718	struct page *page;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2719	struct buffer_head map_bh;
				2720	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2721
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2722	blocksize = 1 << inode->i_blkbits;
				2723	length = offset & (blocksize - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2724
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2725	/* Block boundary? Nothing to do */
				2726	if (!length)
				2727	return 0;
				2728
				2729	length = blocksize - length;
				2730	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2731
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2732	page = grab_cache_page(mapping, index);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2733	err = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2734	if (!page)
				2735	goto out;
				2736
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2737	if (page_has_buffers(page)) {
				2738	has_buffers:
				2739	unlock_page(page);
				2740	page_cache_release(page);
				2741	return block_truncate_page(mapping, from, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2742	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2743
				2744	/* Find the buffer that contains "offset" */
				2745	pos = blocksize;
				2746	while (offset >= pos) {
				2747	iblock++;
				2748	pos += blocksize;
				2749	}
				2750
Theodore Ts'o	460bcf5	2009-05-12 07:37:56 -0400	[diff] [blame]	2751	map_bh.b_size = blocksize;
				2752	map_bh.b_state = 0;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2753	err = get_block(inode, iblock, &map_bh, 0);
				2754	if (err)
				2755	goto unlock;
				2756	/* unmapped? It's a hole - nothing to do */
				2757	if (!buffer_mapped(&map_bh))
				2758	goto unlock;
				2759
				2760	/* Ok, it's mapped. Make sure it's up-to-date */
				2761	if (!PageUptodate(page)) {
				2762	err = mapping->a_ops->readpage(NULL, page);
				2763	if (err) {
				2764	page_cache_release(page);
				2765	goto out;
				2766	}
				2767	lock_page(page);
				2768	if (!PageUptodate(page)) {
				2769	err = -EIO;
				2770	goto unlock;
				2771	}
				2772	if (page_has_buffers(page))
				2773	goto has_buffers;
				2774	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2775	zero_user(page, offset, length);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2776	set_page_dirty(page);
				2777	err = 0;
				2778
				2779	unlock:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2780	unlock_page(page);
				2781	page_cache_release(page);
				2782	out:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2783	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2784	}
				2785	EXPORT_SYMBOL(nobh_truncate_page);
				2786
				2787	int block_truncate_page(struct address_space *mapping,
				2788	loff_t from, get_block_t *get_block)
				2789	{
				2790	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2791	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2792	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2793	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2794	unsigned length, pos;
				2795	struct inode *inode = mapping->host;
				2796	struct page *page;
				2797	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2798	int err;
				2799
				2800	blocksize = 1 << inode->i_blkbits;
				2801	length = offset & (blocksize - 1);
				2802
				2803	/* Block boundary? Nothing to do */
				2804	if (!length)
				2805	return 0;
				2806
				2807	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2808	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2809
				2810	page = grab_cache_page(mapping, index);
				2811	err = -ENOMEM;
				2812	if (!page)
				2813	goto out;
				2814
				2815	if (!page_has_buffers(page))
				2816	create_empty_buffers(page, blocksize, 0);
				2817
				2818	/* Find the buffer that contains "offset" */
				2819	bh = page_buffers(page);
				2820	pos = blocksize;
				2821	while (offset >= pos) {
				2822	bh = bh->b_this_page;
				2823	iblock++;
				2824	pos += blocksize;
				2825	}
				2826
				2827	err = 0;
				2828	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2829	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2830	err = get_block(inode, iblock, bh, 0);
				2831	if (err)
				2832	goto unlock;
				2833	/* unmapped? It's a hole - nothing to do */
				2834	if (!buffer_mapped(bh))
				2835	goto unlock;
				2836	}
				2837
				2838	/* Ok, it's mapped. Make sure it's up-to-date */
				2839	if (PageUptodate(page))
				2840	set_buffer_uptodate(bh);
				2841
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2842	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2843	err = -EIO;
				2844	ll_rw_block(READ, 1, &bh);
				2845	wait_on_buffer(bh);
				2846	/* Uhhuh. Read error. Complain and punt. */
				2847	if (!buffer_uptodate(bh))
				2848	goto unlock;
				2849	}
				2850
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2851	zero_user(page, offset, length);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2852	mark_buffer_dirty(bh);
				2853	err = 0;
				2854
				2855	unlock:
				2856	unlock_page(page);
				2857	page_cache_release(page);
				2858	out:
				2859	return err;
				2860	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2861	EXPORT_SYMBOL(block_truncate_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2862
				2863	/*
				2864	* The generic ->writepage function for buffer-backed address_spaces
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2865	* this form passes in the end_io handler used to finish the IO.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2866	*/
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2867	int block_write_full_page_endio(struct page page, get_block_t get_block,
				2868	struct writeback_control wbc, bh_end_io_t handler)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2869	{
				2870	struct inode * const inode = page->mapping->host;
				2871	loff_t i_size = i_size_read(inode);
				2872	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2873	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2874
				2875	/* Is the page fully inside i_size? */
				2876	if (page->index < end_index)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2877	return __block_write_full_page(inode, page, get_block, wbc,
				2878	handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2879
				2880	/* Is the page fully outside i_size? (truncate in progress) */
				2881	offset = i_size & (PAGE_CACHE_SIZE-1);
				2882	if (page->index >= end_index+1 \|\| !offset) {
				2883	/*
				2884	* The page may have dirty, unmapped buffers. For example,
				2885	* they may have been added in ext3_writepage(). Make them
				2886	* freeable here, so the page does not leak.
				2887	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2888	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2889	unlock_page(page);
				2890	return 0; /* don't care */
				2891	}
				2892
				2893	/*
				2894	* The page straddles i_size. It must be zeroed out on each and every
Adam Buchbinder	2a61aa4	2009-12-11 16:35:40 -0500	[diff] [blame]	2895	* writepage invocation because it may be mmapped. "A file is mapped
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2896	* in multiples of the page size. For a file that is not a multiple of
				2897	* the page size, the remaining memory is zeroed when mapped, and
				2898	* writes to that region are not written out to the file."
				2899	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2900	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2901	return __block_write_full_page(inode, page, get_block, wbc, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2902	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2903	EXPORT_SYMBOL(block_write_full_page_endio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2904
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2905	/*
				2906	* The generic ->writepage function for buffer-backed address_spaces
				2907	*/
				2908	int block_write_full_page(struct page page, get_block_t get_block,
				2909	struct writeback_control *wbc)
				2910	{
				2911	return block_write_full_page_endio(page, get_block, wbc,
				2912	end_buffer_async_write);
				2913	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2914	EXPORT_SYMBOL(block_write_full_page);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2915
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2916	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2917	get_block_t *get_block)
				2918	{
				2919	struct buffer_head tmp;
				2920	struct inode *inode = mapping->host;
				2921	tmp.b_state = 0;
				2922	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2923	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2924	get_block(inode, block, &tmp, 0);
				2925	return tmp.b_blocknr;
				2926	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2927	EXPORT_SYMBOL(generic_block_bmap);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2928
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	2929	static void end_bio_bh_io_sync(struct bio *bio, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2930	{
				2931	struct buffer_head *bh = bio->bi_private;
				2932
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2933	if (err == -EOPNOTSUPP) {
				2934	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2935	set_bit(BH_Eopnotsupp, &bh->b_state);
				2936	}
				2937
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	2938	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
				2939	set_bit(BH_Quiet, &bh->b_state);
				2940
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2941	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2942	bio_put(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2943	}
				2944
				2945	int submit_bh(int rw, struct buffer_head * bh)
				2946	{
				2947	struct bio *bio;
				2948	int ret = 0;
				2949
				2950	BUG_ON(!buffer_locked(bh));
				2951	BUG_ON(!buffer_mapped(bh));
				2952	BUG_ON(!bh->b_end_io);
Aneesh Kumar K.V	8fb0e34	2009-05-12 16:22:37 -0400	[diff] [blame]	2953	BUG_ON(buffer_delay(bh));
				2954	BUG_ON(buffer_unwritten(bh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2955
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2956	/*
				2957	* Mask in barrier bit for a write (could be either a WRITE or a
				2958	* WRITE_SYNC
				2959	*/
				2960	if (buffer_ordered(bh) && (rw & WRITE))
				2961	rw \|= WRITE_BARRIER;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2962
				2963	/*
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2964	* Only clear out a write error when rewriting
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2965	*/
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2966	if (test_set_buffer_req(bh) && (rw & WRITE))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2967	clear_buffer_write_io_error(bh);
				2968
				2969	/*
				2970	* from here on down, it's all bio -- do the initial mapping,
				2971	* submit_bio -> generic_make_request may further map this bio around
				2972	*/
				2973	bio = bio_alloc(GFP_NOIO, 1);
				2974
				2975	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2976	bio->bi_bdev = bh->b_bdev;
				2977	bio->bi_io_vec[0].bv_page = bh->b_page;
				2978	bio->bi_io_vec[0].bv_len = bh->b_size;
				2979	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2980
				2981	bio->bi_vcnt = 1;
				2982	bio->bi_idx = 0;
				2983	bio->bi_size = bh->b_size;
				2984
				2985	bio->bi_end_io = end_bio_bh_io_sync;
				2986	bio->bi_private = bh;
				2987
				2988	bio_get(bio);
				2989	submit_bio(rw, bio);
				2990
				2991	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2992	ret = -EOPNOTSUPP;
				2993
				2994	bio_put(bio);
				2995	return ret;
				2996	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2997	EXPORT_SYMBOL(submit_bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2998
				2999	/**
				3000	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3001	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3002	* @nr: number of &struct buffer_heads in the array
				3003	* @bhs: array of pointers to &struct buffer_head
				3004	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3005	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				3006	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				3007	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				3008	* are sent to disk. The fourth %READA option is described in the documentation
				3009	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3010	*
				3011	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3012	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				3013	* clean when doing a write request, and any buffer that appears to be
				3014	* up-to-date when doing read request. Further it marks as clean buffers that
				3015	* are processed for writing (the buffer cache won't assume that they are
				3016	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3017	*
				3018	* ll_rw_block sets b_end_io to simple completion handler that marks
				3019	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				3020	* any waiters.
				3021	*
				3022	* All of the buffers must be for the same device, and must also be a
				3023	* multiple of the current approved size for the device.
				3024	*/
				3025	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				3026	{
				3027	int i;
				3028
				3029	for (i = 0; i < nr; i++) {
				3030	struct buffer_head *bh = bhs[i];
				3031
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	3032	if (rw == SWRITE \|\| rw == SWRITE_SYNC \|\| rw == SWRITE_SYNC_PLUG)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3033	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	3034	else if (!trylock_buffer(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3035	continue;
				3036
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	3037	if (rw == WRITE \|\| rw == SWRITE \|\| rw == SWRITE_SYNC \|\|
				3038	rw == SWRITE_SYNC_PLUG) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3039	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	3040	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	3041	get_bh(bh);
Jens Axboe	18ce375	2008-07-01 09:07:34 +0200	[diff] [blame]	3042	if (rw == SWRITE_SYNC)
				3043	submit_bh(WRITE_SYNC, bh);
				3044	else
				3045	submit_bh(WRITE, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3046	continue;
				3047	}
				3048	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3049	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	3050	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	3051	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3052	submit_bh(rw, bh);
				3053	continue;
				3054	}
				3055	}
				3056	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3057	}
				3058	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	3059	EXPORT_SYMBOL(ll_rw_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3060
				3061	/*
				3062	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				3063	* and then start new I/O and then wait upon it. The caller must have a ref on
				3064	* the buffer_head.
				3065	*/
				3066	int sync_dirty_buffer(struct buffer_head *bh)
				3067	{
				3068	int ret = 0;
				3069
				3070	WARN_ON(atomic_read(&bh->b_count) < 1);
				3071	lock_buffer(bh);
				3072	if (test_clear_buffer_dirty(bh)) {
				3073	get_bh(bh);
				3074	bh->b_end_io = end_buffer_write_sync;
Jens Axboe	1aa2a7c	2009-04-06 14:48:08 +0200	[diff] [blame]	3075	ret = submit_bh(WRITE_SYNC, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3076	wait_on_buffer(bh);
				3077	if (buffer_eopnotsupp(bh)) {
				3078	clear_buffer_eopnotsupp(bh);
				3079	ret = -EOPNOTSUPP;
				3080	}
				3081	if (!ret && !buffer_uptodate(bh))
				3082	ret = -EIO;
				3083	} else {
				3084	unlock_buffer(bh);
				3085	}
				3086	return ret;
				3087	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	3088	EXPORT_SYMBOL(sync_dirty_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3089
				3090	/*
				3091	* try_to_free_buffers() checks if all the buffers on this particular page
				3092	* are unused, and releases them if so.
				3093	*
				3094	* Exclusion against try_to_free_buffers may be obtained by either
				3095	* locking the page or by holding its mapping's private_lock.
				3096	*
				3097	* If the page is dirty but all the buffers are clean then we need to
				3098	* be sure to mark the page clean as well. This is because the page
				3099	* may be against a block device, and a later reattachment of buffers
				3100	* to a dirty page will set all buffers dirty. Which would corrupt
				3101	* filesystem data on the same device.
				3102	*
				3103	* The same applies to regular filesystem pages: if all the buffers are
				3104	* clean then we set the page clean and proceed. To do that, we require
				3105	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				3106	* private_lock.
				3107	*
				3108	* try_to_free_buffers() is non-blocking.
				3109	*/
				3110	static inline int buffer_busy(struct buffer_head *bh)
				3111	{
				3112	return atomic_read(&bh->b_count) \|
				3113	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				3114	}
				3115
				3116	static int
				3117	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				3118	{
				3119	struct buffer_head *head = page_buffers(page);
				3120	struct buffer_head *bh;
				3121
				3122	bh = head;
				3123	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	3124	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3125	set_bit(AS_EIO, &page->mapping->flags);
				3126	if (buffer_busy(bh))
				3127	goto failed;
				3128	bh = bh->b_this_page;
				3129	} while (bh != head);
				3130
				3131	do {
				3132	struct buffer_head *next = bh->b_this_page;
				3133
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	3134	if (bh->b_assoc_map)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3135	__remove_assoc_queue(bh);
				3136	bh = next;
				3137	} while (bh != head);
				3138	*buffers_to_free = head;
				3139	__clear_page_buffers(page);
				3140	return 1;
				3141	failed:
				3142	return 0;
				3143	}
				3144
				3145	int try_to_free_buffers(struct page *page)
				3146	{
				3147	struct address_space * const mapping = page->mapping;
				3148	struct buffer_head *buffers_to_free = NULL;
				3149	int ret = 0;
				3150
				3151	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3152	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3153	return 0;
				3154
				3155	if (mapping == NULL) { /* can this still happen? */
				3156	ret = drop_buffers(page, &buffers_to_free);
				3157	goto out;
				3158	}
				3159
				3160	spin_lock(&mapping->private_lock);
				3161	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3162
				3163	/*
				3164	* If the filesystem writes its buffers by hand (eg ext3)
				3165	* then we can have clean buffers against a dirty page. We
				3166	* clean the page here; otherwise the VM will never notice
				3167	* that the filesystem did any IO at all.
				3168	*
				3169	* Also, during truncate, discard_buffer will have marked all
				3170	* the page's buffers clean. We discover that here and clean
				3171	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3172	*
				3173	* private_lock must be held over this entire operation in order
				3174	* to synchronise against __set_page_dirty_buffers and prevent the
				3175	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3176	*/
				3177	if (ret)
				3178	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3179	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3180	out:
				3181	if (buffers_to_free) {
				3182	struct buffer_head *bh = buffers_to_free;
				3183
				3184	do {
				3185	struct buffer_head *next = bh->b_this_page;
				3186	free_buffer_head(bh);
				3187	bh = next;
				3188	} while (bh != buffers_to_free);
				3189	}
				3190	return ret;
				3191	}
				3192	EXPORT_SYMBOL(try_to_free_buffers);
				3193
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	3194	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3195	{
				3196	struct address_space *mapping;
				3197
				3198	smp_mb();
				3199	mapping = page_mapping(page);
				3200	if (mapping)
				3201	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3202	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	3203	EXPORT_SYMBOL(block_sync_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3204
				3205	/*
				3206	* There are no bdflush tunables left. But distributions are
				3207	* still running obsolete flush daemons, so we terminate them here.
				3208	*
				3209	* Use of bdflush() is deprecated and will be removed in a future kernel.
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	3210	* The `flush-X' kernel threads fully replace bdflush daemons and this call.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3211	*/
Heiko Carstens	bdc480e	2009-01-14 14:14:12 +0100	[diff] [blame]	3212	SYSCALL_DEFINE2(bdflush, int, func, long, data)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3213	{
				3214	static int msg_count;
				3215
				3216	if (!capable(CAP_SYS_ADMIN))
				3217	return -EPERM;
				3218
				3219	if (msg_count < 5) {
				3220	msg_count++;
				3221	printk(KERN_INFO
				3222	"warning: process `%s' used the obsolete bdflush"
				3223	" system call\n", current->comm);
				3224	printk(KERN_INFO "Fix your initscripts?\n");
				3225	}
				3226
				3227	if (func == 1)
				3228	do_exit(0);
				3229	return 0;
				3230	}
				3231
				3232	/*
				3233	* Buffer-head allocation
				3234	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	3235	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3236
				3237	/*
				3238	* Once the number of bh's in the machine exceeds this level, we start
				3239	* stripping them in writeback.
				3240	*/
				3241	static int max_buffer_heads;
				3242
				3243	int buffer_heads_over_limit;
				3244
				3245	struct bh_accounting {
				3246	int nr; /* Number of live bh's */
				3247	int ratelimit; /* Limit cacheline bouncing */
				3248	};
				3249
				3250	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3251
				3252	static void recalc_bh_state(void)
				3253	{
				3254	int i;
				3255	int tot = 0;
				3256
				3257	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3258	return;
				3259	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3260	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3261	tot += per_cpu(bh_accounting, i).nr;
				3262	buffer_heads_over_limit = (tot > max_buffer_heads);
				3263	}
				3264
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3265	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3266	{
Richard Kennedy	019b4d1	2010-03-10 15:20:33 -0800	[diff] [blame]	3267	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3268	if (ret) {
Christoph Lameter	a35afb8	2007-05-16 22:10:57 -0700	[diff] [blame]	3269	INIT_LIST_HEAD(&ret->b_assoc_buffers);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3270	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3271	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3272	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3273	}
				3274	return ret;
				3275	}
				3276	EXPORT_SYMBOL(alloc_buffer_head);
				3277
				3278	void free_buffer_head(struct buffer_head *bh)
				3279	{
				3280	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3281	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3282	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3283	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3284	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3285	}
				3286	EXPORT_SYMBOL(free_buffer_head);
				3287
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3288	static void buffer_exit_cpu(int cpu)
				3289	{
				3290	int i;
				3291	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3292
				3293	for (i = 0; i < BH_LRU_SIZE; i++) {
				3294	brelse(b->bhs[i]);
				3295	b->bhs[i] = NULL;
				3296	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3297	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3298	per_cpu(bh_accounting, cpu).nr = 0;
				3299	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3300	}
				3301
				3302	static int buffer_cpu_notify(struct notifier_block *self,
				3303	unsigned long action, void *hcpu)
				3304	{
Rafael J. Wysocki	8bb7844	2007-05-09 02:35:10 -0700	[diff] [blame]	3305	if (action == CPU_DEAD \|\| action == CPU_DEAD_FROZEN)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3306	buffer_exit_cpu((unsigned long)hcpu);
				3307	return NOTIFY_OK;
				3308	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3309
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: already up to date, no locking required. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* Re-check under the lock; it may have become up to date meanwhile. */
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 1;
	}

	/* Still stale: return with the buffer locked. */
	return 0;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
				3328
				3329	/**
Randy Dunlap	a6b9191	2008-03-19 17:01:00 -0700	[diff] [blame]	3330	* bh_submit_read - Submit a locked buffer for reading
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3331	* @bh: struct buffer_head
				3332	*
				3333	* Returns zero on success and -EIO on error.
				3334	*/
				3335	int bh_submit_read(struct buffer_head *bh)
				3336	{
				3337	BUG_ON(!buffer_locked(bh));
				3338
				3339	if (buffer_uptodate(bh)) {
				3340	unlock_buffer(bh);
				3341	return 0;
				3342	}
				3343
				3344	get_bh(bh);
				3345	bh->b_end_io = end_buffer_read_sync;
				3346	submit_bh(READ, bh);
				3347	wait_on_buffer(bh);
				3348	if (buffer_uptodate(bh))
				3349	return 0;
				3350	return -EIO;
				3351	}
				3352	EXPORT_SYMBOL(bh_submit_read);
				3353
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3354	void __init buffer_init(void)
				3355	{
				3356	int nrpages;
				3357
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3358	bh_cachep = kmem_cache_create("buffer_head",
				3359	sizeof(struct buffer_head), 0,
				3360	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3361	SLAB_MEM_SPREAD),
Richard Kennedy	019b4d1	2010-03-10 15:20:33 -0800	[diff] [blame]	3362	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3363
				3364	/*
				3365	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3366	*/
				3367	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3368	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3369	hotcpu_notifier(buffer_cpu_notify, 0);
				3370	}