11 files changed, 87 insertions, 237 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 6a34a0f4d37c..06d443450f21 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -397,7 +397,8 @@ prototypes:
 	int (*release) (struct gendisk *, fmode_t);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-	int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
+	int (*direct_access) (struct block_device *, sector_t, void __pmem **,
+				unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
index d11cc2f8077b..c772b47e7ef0 100644
--- a/Documentation/filesystems/btrfs.txt
+++ b/Documentation/filesystems/btrfs.txt
@@ -61,7 +61,7 @@ Options with (*) are default options and will not show in the mount options.
 
 	check_int enables the integrity checker module, which examines all
 	block write requests to ensure on-disk consistency, at a large
-	memory and CPU cost.  
+	memory and CPU cost.
 
 	check_int_data includes extent data in the integrity checks, and
 	implies the check_int option.
@@ -113,7 +113,7 @@ Options with (*) are default options and will not show in the mount options.
 	Disable/enable debugging option to be more verbose in some ENOSPC conditions.
 
   fatal_errors=<action>
-	Action to take when encountering a fatal error: 
+	Action to take when encountering a fatal error:
 	  "bug" - BUG() on a fatal error.  This is the default.
 	  "panic" - panic() on a fatal error.
 
@@ -132,10 +132,10 @@ Options with (*) are default options and will not show in the mount options.
 
   max_inline=<bytes>
 	Specify the maximum amount of space, in bytes, that can be inlined in
-	a metadata B-tree leaf.  The value is specified in bytes, optionally 
+	a metadata B-tree leaf.  The value is specified in bytes, optionally
 	with a K, M, or G suffix, case insensitive.  In practice, this value
 	is limited by the root sector size, with some space unavailable due
-	to leaf headers.  For a 4k sectorsize, max inline data is ~3900 bytes.
+	to leaf headers.  For a 4k sector size, max inline data is ~3900 bytes.
 
   metadata_ratio=<value>
 	Specify that 1 metadata chunk should be allocated after every <value>
@@ -170,7 +170,7 @@ Options with (*) are default options and will not show in the mount options.
 
   recovery
 	Enable autorecovery attempts if a bad tree root is found at mount time.
-	Currently this scans a list of several previous tree roots and tries to 
+	Currently this scans a list of several previous tree roots and tries to
 	use the first readable.
 
   rescan_uuid_tree
@@ -194,7 +194,7 @@ Options with (*) are default options and will not show in the mount options.
   ssd_spread
 	Options to control ssd allocation schemes.  By default, BTRFS will
 	enable or disable ssd allocation heuristics depending on whether a
-	rotational or nonrotational disk is in use.  The ssd and nossd options
+	rotational or non-rotational disk is in use.  The ssd and nossd options
 	can override this autodetection.
 
 	The ssd_spread mount option attempts to allocate into big chunks
@@ -216,13 +216,13 @@ Options with (*) are default options and will not show in the mount options.
 	This allows mounting of subvolumes which are not in the root of the mounted
 	filesystem.
 	You can use "btrfs subvolume show " to see the object ID for a subvolume.
-	
+
   thread_pool=<number>
 	The number of worker threads to allocate.  The default number is equal
 	to the number of CPUs + 2, or 8, whichever is smaller.
 
   user_subvol_rm_allowed
-	Allow subvolumes to be deleted by a non-root user. Use with caution. 
+	Allow subvolumes to be deleted by a non-root user. Use with caution.
 
 MAILING LIST
 ============
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7af2851d667c..7bde64014a89 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -60,9 +60,10 @@ Filesystem support consists of
 - implementing the direct_IO address space operation, and calling
   dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
 - implementing an mmap file operation for DAX files which sets the
-  VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
-  for fault and page_mkwrite (which should probably call dax_fault() and
-  dax_mkwrite(), passing the appropriate get_block() callback)
+  VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+  include handlers for fault, pmd_fault and page_mkwrite (which should
+  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+  appropriate get_block() callback)
 - calling dax_truncate_page() instead of block_truncate_page() for DAX files
 - calling dax_zero_page_range() instead of zero_user() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 88ab81c79109..463f595733e8 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -51,6 +51,17 @@ operations should be provided; others can be included as needed.  Again,
 the return value will be a dentry pointer to the created file, NULL for
 error, or ERR_PTR(-ENODEV) if debugfs support is missing.
 
+Create a file with an initial size, the following function can be used
+instead:
+
+    struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+				struct dentry *parent, void *data,
+				const struct file_operations *fops,
+				loff_t file_size);
+
+file_size is the initial file size. The other parameters are the same
+as the function debugfs_create_file.
+
 In a number of cases, the creation of a set of file operations is not
 actually necessary; the debugfs code provides a number of helper functions
 for simple situations.  Files containing a single integer value can be
@@ -100,6 +111,14 @@ A read on the resulting file will yield either Y (for non-zero values) or
 N, followed by a newline.  If written to, it will accept either upper- or
 lower-case values, or 1 or 0.  Any other input will be silently ignored.
 
+Also, atomic_t values can be placed in debugfs with:
+
+    struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+				struct dentry *parent, atomic_t *value)
+
+A read of this file will get atomic_t values, and a write of this file
+will set atomic_t values.
+
 Another option is exporting a block of arbitrary binary data, with
 this structure and function:
 
@@ -147,6 +166,27 @@ The "base" argument may be 0, but you may want to build the reg32 array
 using __stringify, and a number of register names (macros) are actually
 byte offsets over a base for the register block.
 
+If you want to dump an u32 array in debugfs, you can create file with:
+
+    struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+			struct dentry *parent,
+			u32 *array, u32 elements);
+
+The "array" argument provides data, and the "elements" argument is
+the number of elements in the array. Note: Once array is created its
+size can not be changed.
+
+There is a helper function to create device related seq_file:
+
+   struct dentry *debugfs_create_devm_seqfile(struct device *dev,
+				const char *name,
+				struct dentry *parent,
+				int (*read_fn)(struct seq_file *s,
+					void *data));
+
+The "dev" argument is the device related to this debugfs file, and
+the "read_fn" is a function pointer which to be called to print the
+seq_file content.
 
 There are a couple of other directory-oriented helper functions:
 
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index b9714569e472..55755395d3dc 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -360,8 +360,8 @@ and are copied into the filesystem.  If a transaction is incomplete at
 the time of the crash, then there is no guarantee of consistency for
 the blocks in that transaction so they are discarded (which means any
 filesystem changes they represent are also lost).
-Check Documentation/filesystems/ext3.txt if you want to read more about
-ext3 and journaling.
+Check Documentation/filesystems/ext4.txt if you want to read more about
+ext4 and journaling.
 
 References
 ==========
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 7ed0d17d6721..58758fbef9e0 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -6,210 +6,7 @@ Ext3 was originally released in September 1999. Written by Stephen Tweedie
 for the 2.2 branch, and ported to 2.4 kernels by Peter Braam, Andreas Dilger,
 Andrew Morton, Alexander Viro, Ted Ts'o and Stephen Tweedie.
 
-Ext3 is the ext2 filesystem enhanced with journalling capabilities.
+Ext3 is the ext2 filesystem enhanced with journalling capabilities. The
+filesystem is a subset of ext4 filesystem so use ext4 driver for accessing
+ext3 filesystems.
 
-Options
-=======
-
-When mounting an ext3 filesystem, the following option are accepted:
-(*) == default
-
-ro			Mount filesystem read only. Note that ext3 will replay
-			the journal (and thus write to the partition) even when
-			mounted "read only". Mount options "ro,noload" can be
-			used to prevent writes to the filesystem.
-
-journal=update		Update the ext3 file system's journal to the current
-			format.
-
-journal=inum		When a journal already exists, this option is ignored.
-			Otherwise, it specifies the number of the inode which
-			will represent the ext3 file system's journal file.
-
-journal_path=path
-journal_dev=devnum	When the external journal device's major/minor numbers
-			have changed, these options allow the user to specify
-			the new journal location.  The journal device is
-			identified through either its new major/minor numbers
-			encoded in devnum, or via a path to the device.
-
-norecovery		Don't load the journal on mounting. Note that this forces
-noload			mount of inconsistent filesystem, which can lead to
-			various problems.
-
-data=journal		All data are committed into the journal prior to being
-			written into the main file system.
-
-data=ordered	(*)	All data are forced directly out to the main file
-			system prior to its metadata being committed to the
-			journal.
-
-data=writeback		Data ordering is not preserved, data may be written
-			into the main file system after its metadata has been
-			committed to the journal.
-
-commit=nrsec	(*)	Ext3 can be told to sync all its data and metadata
-			every 'nrsec' seconds. The default value is 5 seconds.
-			This means that if you lose your power, you will lose
-			as much as the latest 5 seconds of work (your
-			filesystem will not be damaged though, thanks to the
-			journaling).  This default value (or any low value)
-			will hurt performance, but it's good for data-safety.
-			Setting it to 0 will have the same effect as leaving
-			it at the default (5 seconds).
-			Setting it to very large values will improve
-			performance.
-
-barrier=<0|1(*)>	This enables/disables the use of write barriers in
-barrier	(*)		the jbd code.  barrier=0 disables, barrier=1 enables.
-nobarrier		This also requires an IO stack which can support
-			barriers, and if jbd gets an error on a barrier
-			write, it will disable again with a warning.
-			Write barriers enforce proper on-disk ordering
-			of journal commits, making volatile disk write caches
-			safe to use, at some performance penalty.  If
-			your disks are battery-backed in one way or another,
-			disabling barriers may safely improve performance.
-			The mount options "barrier" and "nobarrier" can
-			also be used to enable or disable barriers, for
-			consistency with other ext3 mount options.
-
-user_xattr		Enables Extended User Attributes.  Additionally, you
-			need to have extended attribute support enabled in the
-			kernel configuration (CONFIG_EXT3_FS_XATTR).  See the
-			attr(5) manual page and http://acl.bestbits.at/ to
-			learn more about extended attributes.
-
-nouser_xattr		Disables Extended User Attributes.
-
-acl			Enables POSIX Access Control Lists support.
-			Additionally, you need to have ACL support enabled in
-			the kernel configuration (CONFIG_EXT3_FS_POSIX_ACL).
-			See the acl(5) manual page and http://acl.bestbits.at/
-			for more information.
-
-noacl			This option disables POSIX Access Control List
-			support.
-
-reservation
-
-noreservation
-
-bsddf 		(*)	Make 'df' act like BSD.
-minixdf			Make 'df' act like Minix.
-
-check=none		Don't do extra checking of bitmaps on mount.
-nocheck
-
-debug			Extra debugging information is sent to syslog.
-
-errors=remount-ro	Remount the filesystem read-only on an error.
-errors=continue		Keep going on a filesystem error.
-errors=panic		Panic and halt the machine if an error occurs.
-			(These mount options override the errors behavior
-			specified in the superblock, which can be
-			configured using tune2fs.)
-
-data_err=ignore(*)	Just print an error message if an error occurs
-			in a file data buffer in ordered mode.
-data_err=abort		Abort the journal if an error occurs in a file
-			data buffer in ordered mode.
-
-grpid			Give objects the same group ID as their creator.
-bsdgroups
-
-nogrpid		(*)	New objects have the group ID of their creator.
-sysvgroups
-
-resgid=n		The group ID which may use the reserved blocks.
-
-resuid=n		The user ID which may use the reserved blocks.
-
-sb=n			Use alternate superblock at this location.
-
-quota			These options are ignored by the filesystem. They
-noquota			are used only by quota tools to recognize volumes
-grpquota		where quota should be turned on. See documentation
-usrquota		in the quota-tools package for more details
-			(http://sourceforge.net/projects/linuxquota).
-
-jqfmt=<quota type>	These options tell filesystem details about quota
-usrjquota=<file>	so that quota information can be properly updated
-grpjquota=<file>	during journal replay. They replace the above
-			quota options. See documentation in the quota-tools
-			package for more details
-			(http://sourceforge.net/projects/linuxquota).
-
-Specification
-=============
-Ext3 shares all disk implementation with the ext2 filesystem, and adds
-transactions capabilities to ext2.  Journaling is done by the Journaling Block
-Device layer.
-
-Journaling Block Device layer
------------------------------
-The Journaling Block Device layer (JBD) isn't ext3 specific.  It was designed
-to add journaling capabilities to a block device.  The ext3 filesystem code
-will inform the JBD of modifications it is performing (called a transaction).
-The journal supports the transactions start and stop, and in case of a crash,
-the journal can replay the transactions to quickly put the partition back into
-a consistent state.
-
-Handles represent a single atomic update to a filesystem.  JBD can handle an
-external journal on a block device.
-
-Data Mode
----------
-There are 3 different data modes:
-
-* writeback mode
-In data=writeback mode, ext3 does not journal data at all.  This mode provides
-a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
-mode - metadata journaling.  A crash+recovery can cause incorrect data to
-appear in files which were written shortly before the crash.  This mode will
-typically provide the best ext3 performance.
-
-* ordered mode
-In data=ordered mode, ext3 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction.  When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first.  In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
-
-* journal mode
-data=journal mode provides full data and metadata journaling.  All new data is
-written to the journal first, and then to its final location.
-In the event of a crash, the journal can be replayed, bringing both data and
-metadata into a consistent state.  This mode is the slowest except when data
-needs to be read from and written to disk at the same time where it
-outperforms all other modes.
-
-Compatibility
--------------
-
-Ext2 partitions can be easily convert to ext3, with `tune2fs -j <dev>`.
-Ext3 is fully compatible with Ext2.  Ext3 partitions can easily be mounted as
-Ext2.
-
-
-External Tools
-==============
-See manual pages to learn more.
-
-tune2fs: 	create a ext3 journal on a ext2 partition with the -j flag.
-mke2fs: 	create a ext3 partition with the -j flag.
-debugfs: 	ext2 and ext3 file system debugger.
-ext2online:	online (mounted) ext2 and ext3 filesystem resizer
-
-
-References
-==========
-
-kernel source:	<file:fs/ext3/>
-		<file:fs/jbd/>
-
-programs: 	http://e2fsprogs.sourceforge.net/
-		http://ext2resize.sourceforge.net
-
-useful links:	http://www.ibm.com/developerworks/library/l-fs7/index.html
-        http://www.ibm.com/developerworks/library/l-fs8/index.html
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index e9e750e59efc..e2d5105b7214 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -143,7 +143,9 @@ fastboot               This option is used when a system wants to reduce mount
 extent_cache           Enable an extent cache based on rb-tree, it can cache
                        as many as extent which map between contiguous logical
                        address and physical address per inode, resulting in
-                       increasing the cache hit ratio.
+                       increasing the cache hit ratio. Set by default.
+noextent_cache         Diable an extent cache based on rb-tree explicitly, see
+                       the above extent_cache mount option.
 noinline_data          Disable the inline data feature, inline data feature is
                        enabled by default.
 
diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index 95c13aa575ff..906b6c233f62 100644
--- a/Documentation/filesystems/nfs/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
@@ -138,9 +138,9 @@ Installation
   - Build, install, reboot
 
     The NFS/RDMA code will be enabled automatically if NFS and RDMA
-    are turned on. The NFS/RDMA client and server are configured via the
-    SUNRPC_XPRT_RDMA_CLIENT and SUNRPC_XPRT_RDMA_SERVER config options that both
-    depend on SUNRPC and INFINIBAND. The default value of both options will be:
+    are turned on. The NFS/RDMA client and server are configured via the hidden
+    SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
+    value of SUNRPC_XPRT_RDMA will be:
 
      - N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
        and server will not be built
@@ -238,9 +238,8 @@ NFS/RDMA Setup
 
   - Start the NFS server
 
-    If the NFS/RDMA server was built as a module
-    (CONFIG_SUNRPC_XPRT_RDMA_SERVER=m in kernel config), load the RDMA
-    transport module:
+    If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+    kernel config), load the RDMA transport module:
 
     $ modprobe svcrdma
 
@@ -259,9 +258,8 @@ NFS/RDMA Setup
 
   - On the client system
 
-    If the NFS/RDMA client was built as a module
-    (CONFIG_SUNRPC_XPRT_RDMA_CLIENT=m in kernel config), load the RDMA client
-    module:
+    If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+    kernel config), load the RDMA client module:
 
     $ modprobe xprtrdma.ko
 
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 6f7fafde0884..d411ca63c8b6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -424,6 +424,7 @@ Private_Dirty:         0 kB
 Referenced:          892 kB
 Anonymous:             0 kB
 Swap:                  0 kB
+SwapPss:               0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:              374 kB
@@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
 (size), the amount of the mapping that is currently resident in RAM (RSS), the
 process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping.  Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e.  is currently used
-by only one process, is accounted as private and not as shared.  "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e.  is currently used by only one process, is accounted
+as private and not as shared.
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
 "Anonymous" shows the amount of memory that does not belong to any file.  Even
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
 "Swap" shows how much would-be-anonymous memory is also used, but out on
 swap.
-
+"SwapPss" shows proportional swap share of this mapping.
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
 manner. The codes are the following:
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index b35a64b82f9e..9494afb9476a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -212,7 +212,10 @@ Other notes:
 - show() methods should return the number of bytes printed into the
   buffer. This is the return value of scnprintf().
 
-- show() should always use scnprintf().
+- show() must not use snprintf() when formatting the value to be
+  returned to user space. If you can guarantee that an overflow
+  will never happen you can use sprintf() otherwise you must use
+  scnprintf().
 
 - store() should return the number of bytes used from the buffer. If the
   entire buffer has been used, just return the count argument.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5eb8456fc41e..8c6f07ad373a 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -769,7 +769,7 @@ struct address_space_operations {
 	to stall to allow flushers a chance to complete some IO. Ordinarily
 	it can use PageDirty and PageWriteback but some filesystems have
 	more complex state (unstable pages in NFS prevent reclaim) or
-	do not set those flags due to locking problems (jbd). This callback
+	do not set those flags due to locking problems. This callback
 	allows a filesystem to indicate to the VM if a page should be
 	treated as dirty or writeback for the purposes of stalling.