aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorFathi Boudra <fathi.boudra@linaro.org>2013-04-28 09:33:08 +0300
committerFathi Boudra <fathi.boudra@linaro.org>2013-04-28 09:33:08 +0300
commit3b4bd47f8f4ed3aaf7c81c9b5d2d37ad79fadf4a (patch)
treeb9996006addfd7ae70a39672b76843b49aebc189 /mm
downloadlinux-linaro-highbank-master.tar.gz
Imported Upstream version 3.9.0HEADupstream/3.9.0upstreammaster
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig473
-rw-r--r--mm/Kconfig.debug29
-rw-r--r--mm/Makefile60
-rw-r--r--mm/backing-dev.c869
-rw-r--r--mm/balloon_compaction.c302
-rw-r--r--mm/bootmem.c867
-rw-r--r--mm/bounce.c341
-rw-r--r--mm/cleancache.c220
-rw-r--r--mm/compaction.c1209
-rw-r--r--mm/debug-pagealloc.c102
-rw-r--r--mm/dmapool.c514
-rw-r--r--mm/fadvise.c170
-rw-r--r--mm/failslab.c60
-rw-r--r--mm/filemap.c2578
-rw-r--r--mm/filemap_xip.c486
-rw-r--r--mm/fremap.c267
-rw-r--r--mm/frontswap.c370
-rw-r--r--mm/highmem.c424
-rw-r--r--mm/huge_memory.c2779
-rw-r--r--mm/hugetlb.c3184
-rw-r--r--mm/hugetlb_cgroup.c426
-rw-r--r--mm/hwpoison-inject.c141
-rw-r--r--mm/init-mm.c25
-rw-r--r--mm/internal.h377
-rw-r--r--mm/interval_tree.c112
-rw-r--r--mm/kmemcheck.c122
-rw-r--r--mm/kmemleak-test.c109
-rw-r--r--mm/kmemleak.c1866
-rw-r--r--mm/ksm.c2441
-rw-r--r--mm/maccess.c62
-rw-r--r--mm/madvise.c551
-rw-r--r--mm/memblock.c1076
-rw-r--r--mm/memcontrol.c6852
-rw-r--r--mm/memory-failure.c1636
-rw-r--r--mm/memory.c4253
-rw-r--r--mm/memory_hotplug.c1868
-rw-r--r--mm/mempolicy.c2856
-rw-r--r--mm/mempool.c371
-rw-r--r--mm/migrate.c1754
-rw-r--r--mm/mincore.c316
-rw-r--r--mm/mlock.c590
-rw-r--r--mm/mm_init.c159
-rw-r--r--mm/mmap.c3097
-rw-r--r--mm/mmu_context.c62
-rw-r--r--mm/mmu_notifier.c334
-rw-r--r--mm/mmzone.c116
-rw-r--r--mm/mprotect.c419
-rw-r--r--mm/mremap.c559
-rw-r--r--mm/msync.c103
-rw-r--r--mm/nobootmem.c425
-rw-r--r--mm/nommu.c2115
-rw-r--r--mm/oom_kill.c688
-rw-r--r--mm/page-writeback.c2321
-rw-r--r--mm/page_alloc.c6188
-rw-r--r--mm/page_cgroup.c528
-rw-r--r--mm/page_io.c292
-rw-r--r--mm/page_isolation.c259
-rw-r--r--mm/pagewalk.c246
-rw-r--r--mm/percpu-km.c108
-rw-r--r--mm/percpu-vm.c448
-rw-r--r--mm/percpu.c1945
-rw-r--r--mm/pgtable-generic.c173
-rw-r--r--mm/process_vm_access.c483
-rw-r--r--mm/quicklist.c102
-rw-r--r--mm/readahead.c604
-rw-r--r--mm/rmap.c1785
-rw-r--r--mm/shmem.c3017
-rw-r--r--mm/slab.c4630
-rw-r--r--mm/slab.h232
-rw-r--r--mm/slab_common.c466
-rw-r--r--mm/slob.c625
-rw-r--r--mm/slub.c5445
-rw-r--r--mm/sparse-vmemmap.c226
-rw-r--r--mm/sparse.c816
-rw-r--r--mm/swap.c877
-rw-r--r--mm/swap_state.c423
-rw-r--r--mm/swapfile.c2549
-rw-r--r--mm/truncate.c617
-rw-r--r--mm/util.c411
-rw-r--r--mm/vmalloc.c2649
-rw-r--r--mm/vmscan.c3612
-rw-r--r--mm/vmstat.c1412
82 files changed, 94674 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 00000000..3bea74f1
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,473 @@
+config SELECT_MEMORY_MODEL
+ def_bool y
+ depends on ARCH_SELECT_MEMORY_MODEL
+
+choice
+ prompt "Memory model"
+ depends on SELECT_MEMORY_MODEL
+ default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+ default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+ default FLATMEM_MANUAL
+
+config FLATMEM_MANUAL
+ bool "Flat Memory"
+ depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here: FLATMEM. This is normal
+ and a correct option.
+
+ Some users of more advanced features like NUMA and
+ memory hotplug may have different options here.
+ DISCONTIGMEM is an more mature, better tested system,
+ but is incompatible with memory hotplug and may suffer
+ decreased performance over SPARSEMEM. If unsure between
+ "Sparse Memory" and "Discontiguous Memory", choose
+ "Discontiguous Memory".
+
+ If unsure, choose this option (Flat Memory) over any other.
+
+config DISCONTIGMEM_MANUAL
+ bool "Discontiguous Memory"
+ depends on ARCH_DISCONTIGMEM_ENABLE
+ help
+ This option provides enhanced support for discontiguous
+ memory systems, over FLATMEM. These systems have holes
+ in their physical address spaces, and this option provides
+ more efficient handling of these holes. However, the vast
+ majority of hardware has quite flat address spaces, and
+ can have degraded performance from the extra overhead that
+ this option imposes.
+
+ Many NUMA configurations will have this as the only option.
+
+ If unsure, choose "Flat Memory" over this option.
+
+config SPARSEMEM_MANUAL
+ bool "Sparse Memory"
+ depends on ARCH_SPARSEMEM_ENABLE
+ help
+ This will be the only option for some systems, including
+ memory hotplug systems. This is normal.
+
+ For many other systems, this will be an alternative to
+ "Discontiguous Memory". This option provides some potential
+ performance benefits, along with decreased code complexity,
+ but it is newer, and more experimental.
+
+ If unsure, choose "Discontiguous Memory" or "Flat Memory"
+ over this option.
+
+endchoice
+
+config DISCONTIGMEM
+ def_bool y
+ depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+
+config SPARSEMEM
+ def_bool y
+ depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
+
+config FLATMEM
+ def_bool y
+ depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+ def_bool y
+ depends on !SPARSEMEM
+
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory. This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+ def_bool y
+ depends on DISCONTIGMEM || NUMA
+
+config HAVE_MEMORY_PRESENT
+ def_bool y
+ depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
+
+#
+# SPARSEMEM_EXTREME (which is the default) does some bootmem
+# allocations when memory_present() is called. If this cannot
+# be done on your architecture, select this option. However,
+# statically allocating the mem_section[] array can potentially
+# consume vast quantities of .bss, so be careful.
+#
+# This option will also potentially produce smaller runtime code
+# with gcc 3.4 and later.
+#
+config SPARSEMEM_STATIC
+ bool
+
+#
+# Architecture platforms which require a two level mem_section in SPARSEMEM
+# must select this option. This is usually for architecture platforms with
+# an extremely sparse physical address space.
+#
+config SPARSEMEM_EXTREME
+ def_bool y
+ depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+config SPARSEMEM_VMEMMAP_ENABLE
+ bool
+
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ def_bool y
+ depends on SPARSEMEM && X86_64
+
+config SPARSEMEM_VMEMMAP
+ bool "Sparse Memory virtual memmap"
+ depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+ default y
+ help
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
+
+config HAVE_MEMBLOCK
+ boolean
+
+config HAVE_MEMBLOCK_NODE_MAP
+ boolean
+
+config ARCH_DISCARD_MEMBLOCK
+ boolean
+
+config NO_BOOTMEM
+ boolean
+
+config MEMORY_ISOLATION
+ boolean
+
+config MOVABLE_NODE
+ boolean "Enable to assign a node which has only movable memory"
+ depends on HAVE_MEMBLOCK
+ depends on NO_BOOTMEM
+ depends on X86_64
+ depends on NUMA
+ default n
+ help
+ Allow a node to have only movable memory. Pages used by the kernel,
+ such as direct mapping pages cannot be migrated. So the corresponding
+ memory device cannot be hotplugged. This option allows users to
+ online all the memory of a node as movable memory so that the whole
+ node can be hotplugged. Users who don't use the memory hotplug
+ feature are fine with this option on since they don't online memory
+ as movable.
+
+ Say Y here if you want to hotplug a whole node.
+ Say N here if you want kernel to use memory on all nodes evenly.
+
+#
+# Only be set on architectures that have completely implemented memory hotplug
+# feature. If you are not sure, don't touch it.
+#
+config HAVE_BOOTMEM_INFO_NODE
+ def_bool n
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ depends on SPARSEMEM || X86_64_ACPI_NUMA
+ depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
+
+config MEMORY_HOTPLUG_SPARSE
+ def_bool y
+ depends on SPARSEMEM && MEMORY_HOTPLUG
+
+config MEMORY_HOTREMOVE
+ bool "Allow for memory hot remove"
+ select MEMORY_ISOLATION
+ select HAVE_BOOTMEM_INFO_NODE if X86_64
+ depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
+ depends on MIGRATION
+
+#
+# If we have space for more page flags then we can enable additional
+# optimizations and functionality.
+#
+# Regular Sparsemem takes page flag bits for the sectionid if it does not
+# use a virtual memmap. Disable extended page flags for 32 bit platforms
+# that require the use of a sectionid in the page flags.
+#
+config PAGEFLAGS_EXTENDED
+ def_bool y
+ depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "999999" if ARM && !CPU_CACHE_VIPT
+ default "999999" if PARISC && !PA20
+ default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
+ default "4"
+
+#
+# support for memory balloon compaction
+config BALLOON_COMPACTION
+ bool "Allow for balloon memory compaction/migration"
+ def_bool y
+ depends on COMPACTION && VIRTIO_BALLOON
+ help
+ Memory fragmentation introduced by ballooning might reduce
+ significantly the number of 2MB contiguous memory blocks that can be
+ used within a guest, thus imposing performance penalties associated
+ with the reduced number of transparent huge pages that could be used
+ by the guest workload. Allowing the compaction & migration for memory
+ pages enlisted as being part of memory balloon devices avoids the
+ scenario aforementioned and helps improving memory defragmentation.
+
+#
+# support for memory compaction
+config COMPACTION
+ bool "Allow for memory compaction"
+ def_bool y
+ select MIGRATION
+ depends on MMU
+ help
+ Allows the compaction of memory for the allocation of huge pages.
+
+#
+# support for page migration
+#
+config MIGRATION
+ bool "Page migration"
+ def_bool y
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
+ help
+ Allows the migration of the physical location of pages of processes
+ while the virtual addresses are not changed. This is useful in
+ two situations. The first is on NUMA systems to put pages nearer
+ to the processors accessing. The second is when allocating huge
+ pages as migration can relocate pages to satisfy a huge page
+ allocation instead of reclaiming.
+
+config PHYS_ADDR_T_64BIT
+ def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+
+config ZONE_DMA_FLAG
+ int
+ default "0" if !ZONE_DMA
+ default "1"
+
+config BOUNCE
+ def_bool y
+ depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+
+# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
+# have more than 4GB of memory, but we don't currently use the IOTLB to present
+# a 32-bit address to OHCI. So we need to use a bounce pool instead.
+#
+# We also use the bounce pool to provide stable page writes for jbd. jbd
+# initiates buffer writeback without locking the page or setting PG_writeback,
+# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
+# a major rework effort. Instead, use the bounce buffer to snapshot pages
+# (until jbd goes away). The only jbd user is ext3.
+config NEED_BOUNCE_POOL
+ bool
+ default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
+
+config NR_QUICK
+ int
+ depends on QUICKLIST
+ default "2" if AVR32
+ default "1"
+
+config VIRT_TO_BUS
+ bool
+ help
+ An architecture should select this if it implements the
+ deprecated interface virt_to_bus(). All new architectures
+ should probably not select this.
+
+
+config MMU_NOTIFIER
+ bool
+
+config KSM
+ bool "Enable KSM for page merging"
+ depends on MMU
+ help
+ Enable Kernel Samepage Merging: KSM periodically scans those areas
+ of an application's address space that an app has advised may be
+ mergeable. When it finds pages of identical content, it replaces
+ the many instances by a single page with that content, so
+ saving memory until one or another app needs to modify the content.
+ Recommended for use with KVM, or with other duplicative applications.
+ See Documentation/vm/ksm.txt for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
+
+config DEFAULT_MMAP_MIN_ADDR
+ int "Low address space to protect from user allocation"
+ depends on MMU
+ default 4096
+ help
+ This is the portion of low virtual memory which should be protected
+ from userspace allocation. Keeping a user from writing to low pages
+ can help reduce the impact of kernel NULL pointer bugs.
+
+ For most ia64, ppc64 and x86 users with lots of address space
+ a value of 65536 is reasonable and should cause no problems.
+ On arm and other archs it should not be higher than 32768.
+ Programs which use vm86 functionality or have some need to map
+ this low address space will need CAP_SYS_RAWIO or disable this
+ protection by setting the value to 0.
+
+ This value can be changed after boot using the
+ /proc/sys/vm/mmap_min_addr tunable.
+
+config ARCH_SUPPORTS_MEMORY_FAILURE
+ bool
+
+config MEMORY_FAILURE
+ depends on MMU
+ depends on ARCH_SUPPORTS_MEMORY_FAILURE
+ bool "Enable recovery from hardware memory errors"
+ select MEMORY_ISOLATION
+ help
+ Enables code to recover from some memory failures on systems
+ with MCA recovery. This allows a system to continue running
+ even when some of its memory has uncorrected errors. This requires
+ special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+ tristate "HWPoison pages injector"
+ depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
+ select PROC_PAGE_MONITOR
+
+config NOMMU_INITIAL_TRIM_EXCESS
+ int "Turn on mmap() excess space trimming before booting"
+ depends on !MMU
+ default 1
+ help
+ The NOMMU mmap() frequently needs to allocate large contiguous chunks
+ of memory on which to store mappings, but it can only ask the system
+ allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+ more than it requires. To deal with this, mmap() is able to trim off
+ the excess and return it to the allocator.
+
+ If trimming is enabled, the excess is trimmed off and returned to the
+ system allocator, which can cause extra fragmentation, particularly
+ if there are a lot of transient processes.
+
+ If trimming is disabled, the excess is kept, but not used, which for
+ long-term mappings means that the space is wasted.
+
+ Trimming can be dynamically controlled through a sysctl option
+ (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+ excess pages there must be before trimming should occur, or zero if
+ no trimming is to occur.
+
+ This option specifies the initial value of this option. The default
+ of 1 says that all excess pages should be trimmed.
+
+ See Documentation/nommu-mmap.txt for more information.
+
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select COMPACTION
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ If memory constrained on embedded, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Enabling Transparent Hugepage always, can increase the
+ memory footprint of applications without a guaranteed
+ benefit but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Enabling Transparent Hugepage madvise, will only provide a
+ performance improvement benefit to the applications using
+ madvise(MADV_HUGEPAGE) but it won't risk to increase the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
+config CROSS_MEMORY_ATTACH
+ bool "Cross Memory Support"
+ depends on MMU
+ default y
+ help
+ Enabling this option adds the system calls process_vm_readv and
+ process_vm_writev which allow a process with the correct privileges
+ to directly read from or write to to another process's address space.
+ See the man page for more details.
+
+#
+# UP and nommu archs use km based percpu allocator
+#
+config NEED_PER_CPU_KM
+ depends on !SMP
+ bool
+ default y
+
+config CLEANCACHE
+ bool "Enable cleancache driver to cache clean pages if tmem is present"
+ default n
+ help
+ Cleancache can be thought of as a page-granularity victim cache
+ for clean pages that the kernel's pageframe replacement algorithm
+ (PFRA) would like to keep around, but can't since there isn't enough
+ memory. So when the PFRA "evicts" a page, it first attempts to use
+ cleancache code to put the data contained in that page into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. And when a cleancache-enabled
+ filesystem wishes to access a page in a file on disk, it first
+ checks cleancache to see if it already contains it; if it does,
+ the page is copied into the kernel and a disk access is avoided.
+ When a transcendent memory driver is available (such as zcache or
+ Xen transcendent memory), a significant I/O reduction
+ may be achieved. When none is available, all cleancache calls
+ are reduced to a single pointer-compare-against-NULL resulting
+ in a negligible performance hit.
+
+ If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 00000000..4b244325
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,29 @@
+config DEBUG_PAGEALLOC
+ bool "Debug page memory allocations"
+ depends on DEBUG_KERNEL
+ depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
+ depends on !KMEMCHECK
+ select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ ---help---
+ Unmap pages from the kernel linear mapping after free_pages().
+ This results in a large slowdown, but helps to find certain types
+ of memory corruption.
+
+ For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
+ fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). Additionally,
+ this option cannot be enabled in combination with hibernation as
+ that would result in incorrect warnings of memory corruption after
+ a resume because free pages are not saved to the suspend image.
+
+config WANT_PAGE_DEBUG_FLAGS
+ bool
+
+config PAGE_POISONING
+ bool
+ select WANT_PAGE_DEBUG_FLAGS
+
+config PAGE_GUARD
+ bool
+ select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/Makefile b/mm/Makefile
new file mode 100644
index 00000000..3a462875
--- /dev/null
+++ b/mm/Makefile
@@ -0,0 +1,60 @@
+#
+# Makefile for the linux memory manager.
+#
+
+mmu-y := nommu.o
+mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
+ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
+ vmalloc.o pagewalk.o pgtable-generic.o
+
+ifdef CONFIG_CROSS_MEMORY_ATTACH
+mmu-$(CONFIG_MMU) += process_vm_access.o
+endif
+
+obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
+ maccess.o page_alloc.o page-writeback.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+ compaction.o balloon_compaction.o \
+ interval_tree.o $(mmu-y)
+
+obj-y += init-mm.o
+
+ifdef CONFIG_NO_BOOTMEM
+ obj-y += nobootmem.o
+else
+ obj-y += bootmem.o
+endif
+
+obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
+obj-$(CONFIG_BOUNCE) += bounce.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
+obj-$(CONFIG_HAS_DMA) += dmapool.o
+obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_SPARSEMEM) += sparse.o
+obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
+obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_SLAB) += slab.o
+obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_FAILSLAB) += failslab.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 00000000..41733c5d
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,869 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+#include <trace/events/writeback.h>
+
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+struct backing_dev_info default_backing_dev_info = {
+ .name = "default",
+ .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
+ .state = 0,
+ .capabilities = BDI_CAP_MAP_COPY,
+};
+EXPORT_SYMBOL_GPL(default_backing_dev_info);
+
+struct backing_dev_info noop_backing_dev_info = {
+ .name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
+static struct class *bdi_class;
+
+/*
+ * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
+ * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * locking.
+ */
+DEFINE_SPINLOCK(bdi_lock);
+LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bdi_debug_root;
+
+static void bdi_debug_init(void)
+{
+ bdi_debug_root = debugfs_create_dir("bdi", NULL);
+}
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ struct bdi_writeback *wb = &bdi->wb;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
+ unsigned long nr_dirty, nr_io, nr_more_io;
+ struct inode *inode;
+
+ nr_dirty = nr_io = nr_more_io = 0;
+ spin_lock(&wb->list_lock);
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
+ nr_more_io++;
+ spin_unlock(&wb->list_lock);
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ seq_printf(m,
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiDirtied: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
+ (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
+#undef K
+
+ return 0;
+}
+
+static int bdi_debug_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, bdi_debug_stats_show, inode->i_private);
+}
+
+static const struct file_operations bdi_debug_stats_fops = {
+ .open = bdi_debug_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+ bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+ bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
+ bdi, &bdi_debug_stats_fops);
+}
+
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+ debugfs_remove(bdi->debug_stats);
+ debugfs_remove(bdi->debug_dir);
+}
+#else
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned long read_ahead_kb;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 10, &read_ahead_kb);
+ if (ret < 0)
+ return ret;
+
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+ return count;
+}
+
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+static ssize_t min_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
+static ssize_t max_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
+static ssize_t stable_pages_required_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ bdi_cap_stable_pages_required(bdi) ? 1 : 0);
+}
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
+static struct device_attribute bdi_dev_attrs[] = {
+ __ATTR_RW(read_ahead_kb),
+ __ATTR_RW(min_ratio),
+ __ATTR_RW(max_ratio),
+ __ATTR_RO(stable_pages_required),
+ __ATTR_NULL,
+};
+
+static __init int bdi_class_init(void)
+{
+ bdi_class = class_create(THIS_MODULE, "bdi");
+ if (IS_ERR(bdi_class))
+ return PTR_ERR(bdi_class);
+
+ bdi_class->dev_attrs = bdi_dev_attrs;
+ bdi_debug_init();
+ return 0;
+}
+postcore_initcall(bdi_class_init);
+
+static int __init default_bdi_init(void)
+{
+ int err;
+
+ err = bdi_init(&default_backing_dev_info);
+ if (!err)
+ bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
+
+ return err;
+}
+subsys_initcall(default_bdi_init);
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+ return wb_has_dirty_io(&bdi->wb);
+}
+
+static void wakeup_timer_fn(unsigned long data)
+{
+ struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+
+ spin_lock_bh(&bdi->wb_lock);
+ if (bdi->wb.task) {
+ trace_writeback_wake_thread(bdi);
+ wake_up_process(bdi->wb.task);
+ } else if (bdi->dev) {
+ /*
+ * When bdi tasks are inactive for long time, they are killed.
+ * In this case we have to wake-up the forker thread which
+ * should create and run the bdi thread.
+ */
+ trace_writeback_wake_forker_thread(bdi);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+ spin_unlock_bh(&bdi->wb_lock);
+}
+
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
+
+/*
+ * Calculate the longest interval (jiffies) bdi threads are allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive(void)
+{
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+ return max(5UL * 60 * HZ, interval);
+}
+
+/*
+ * Clear pending bit and wakeup anybody waiting for flusher thread creation or
+ * shutdown
+ */
+static void bdi_clear_pending(struct backing_dev_info *bdi)
+{
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
+}
+
+static int bdi_forker_thread(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
+
+ current->flags |= PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(current, 0);
+
+ for (;;) {
+ struct task_struct *task = NULL;
+ struct backing_dev_info *bdi;
+ enum {
+ NO_ACTION, /* Nothing to do */
+ FORK_THREAD, /* Fork bdi thread */
+ KILL_THREAD, /* Kill inactive bdi thread */
+ } action = NO_ACTION;
+
+ /*
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
+ */
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+ del_timer(&me->wakeup_timer);
+ wb_do_writeback(me, 0);
+ }
+
+ spin_lock_bh(&bdi_lock);
+ /*
+ * In the following loop we are going to check whether we have
+ * some work to do without any synchronization with tasks
+ * waking us up to do work for them. Set the task state here
+ * so that we don't miss wakeups after verifying conditions.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ bool have_dirty_io;
+
+ if (!bdi_cap_writeback_dirty(bdi) ||
+ bdi_cap_flush_forker(bdi))
+ continue;
+
+ WARN(!test_bit(BDI_registered, &bdi->state),
+ "bdi %p/%s is not registered!\n", bdi, bdi->name);
+
+ have_dirty_io = !list_empty(&bdi->work_list) ||
+ wb_has_dirty_io(&bdi->wb);
+
+ /*
+ * If the bdi has work to do, but the thread does not
+ * exist - create it.
+ */
+ if (!bdi->wb.task && have_dirty_io) {
+ /*
+ * Set the pending bit - if someone will try to
+ * unregister this bdi - it'll wait on this bit.
+ */
+ set_bit(BDI_pending, &bdi->state);
+ action = FORK_THREAD;
+ break;
+ }
+
+ spin_lock(&bdi->wb_lock);
+
+ /*
+ * If there is no work to do and the bdi thread was
+ * inactive long enough - kill it. The wb_lock is taken
+ * to make sure no-one adds more work to this bdi and
+ * wakes the bdi thread up.
+ */
+ if (bdi->wb.task && !have_dirty_io &&
+ time_after(jiffies, bdi->wb.last_active +
+ bdi_longest_inactive())) {
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock(&bdi->wb_lock);
+ set_bit(BDI_pending, &bdi->state);
+ action = KILL_THREAD;
+ break;
+ }
+ spin_unlock(&bdi->wb_lock);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ /* Keep working if default bdi still has things to do */
+ if (!list_empty(&me->bdi->work_list))
+ __set_current_state(TASK_RUNNING);
+
+ switch (action) {
+ case FORK_THREAD:
+ __set_current_state(TASK_RUNNING);
+ task = kthread_create(bdi_writeback_thread, &bdi->wb,
+ "flush-%s", dev_name(bdi->dev));
+ if (IS_ERR(task)) {
+ /*
+ * If thread creation fails, force writeout of
+ * the bdi from the thread. Hopefully 1024 is
+ * large enough for efficient IO.
+ */
+ writeback_inodes_wb(&bdi->wb, 1024,
+ WB_REASON_FORKER_THREAD);
+ } else {
+ /*
+ * The spinlock makes sure we do not lose
+ * wake-ups when racing with 'bdi_queue_work()'.
+ * And as soon as the bdi thread is visible, we
+ * can start it.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->wb.task = task;
+ spin_unlock_bh(&bdi->wb_lock);
+ wake_up_process(task);
+ }
+ bdi_clear_pending(bdi);
+ break;
+
+ case KILL_THREAD:
+ __set_current_state(TASK_RUNNING);
+ kthread_stop(task);
+ bdi_clear_pending(bdi);
+ break;
+
+ case NO_ACTION:
+ if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+ /*
+ * There are no dirty data. The only thing we
+ * should now care about is checking for
+ * inactive bdi threads and killing them. Thus,
+ * let's sleep for longer time, save energy and
+ * be friendly for battery-driven devices.
+ */
+ schedule_timeout(bdi_longest_inactive());
+ else
+ schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+ try_to_freeze();
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ synchronize_rcu_expedited();
+}
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct device *dev;
+
+ if (bdi->dev) /* The driver needs to use separate queues per device */
+ return 0;
+
+ va_start(args, fmt);
+ dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+ va_end(args);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ bdi->dev = dev;
+
+ /*
+ * Just start the forker thread for our default backing_dev_info,
+ * and add other bdi's to the list. They will get a thread created
+ * on-demand when they need it.
+ */
+ if (bdi_cap_flush_forker(bdi)) {
+ struct bdi_writeback *wb = &bdi->wb;
+
+ wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
+ dev_name(dev));
+ if (IS_ERR(wb->task))
+ return PTR_ERR(wb->task);
+ }
+
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(BDI_registered, &bdi->state);
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
+}
+EXPORT_SYMBOL(bdi_register);
+
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+{
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+{
+ struct task_struct *task;
+
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ /*
+ * Make sure nobody finds us on the bdi_list anymore
+ */
+ bdi_remove_from_list(bdi);
+
+ /*
+ * If setup is pending, wait for that to complete first
+ */
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * Finally, kill the kernel thread. We don't need to be RCU
+ * safe anymore, since the bdi is gone from visibility.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock_bh(&bdi->wb_lock);
+
+ if (task)
+ kthread_stop(task);
+}
+
+/*
+ * This bdi is going away now, make sure that no super_blocks point to it
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+ struct super_block *sb;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdi == bdi)
+ sb->s_bdi = &default_backing_dev_info;
+ }
+ spin_unlock(&sb_lock);
+}
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ struct device *dev = bdi->dev;
+
+ if (dev) {
+ bdi_set_min_ratio(bdi, 0);
+ trace_writeback_bdi_unregister(bdi);
+ bdi_prune_sb(bdi);
+ del_timer_sync(&bdi->wb.wakeup_timer);
+
+ if (!bdi_cap_flush_forker(bdi))
+ bdi_wb_shutdown(bdi);
+ bdi_debug_unregister(bdi);
+
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->dev = NULL;
+ spin_unlock_bh(&bdi->wb_lock);
+
+ device_unregister(dev);
+ }
+}
+EXPORT_SYMBOL(bdi_unregister);
+
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
+ setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+}
+
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
+int bdi_init(struct backing_dev_info *bdi)
+{
+ int i, err;
+
+ bdi->dev = NULL;
+
+ bdi->min_ratio = 0;
+ bdi->max_ratio = 100;
+ bdi->max_prop_frac = FPROP_FRAC_BASE;
+ spin_lock_init(&bdi->wb_lock);
+ INIT_LIST_HEAD(&bdi->bdi_list);
+ INIT_LIST_HEAD(&bdi->work_list);
+
+ bdi_wb_init(&bdi->wb, bdi);
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+ if (err)
+ goto err;
+ }
+
+ bdi->dirty_exceeded = 0;
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->balanced_dirty_ratelimit = INIT_BW;
+ bdi->dirty_ratelimit = INIT_BW;
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
+ err = fprop_local_init_percpu(&bdi->completions);
+
+ if (err) {
+err:
+ while (i--)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ int i;
+
+ /*
+ * Splice our entries to the default_backing_dev_info, if this
+ * bdi disappears
+ */
+ if (bdi_has_dirty_io(bdi)) {
+ struct bdi_writeback *dst = &default_backing_dev_info.wb;
+
+ bdi_lock_two(&bdi->wb, dst);
+ list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
+ list_splice(&bdi->wb.b_io, &dst->b_io);
+ list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
+ }
+
+ bdi_unregister(bdi);
+
+ /*
+ * If bdi_unregister() had already been called earlier, the
+ * wakeup_timer could still be armed because bdi_prune_sb()
+ * can race with the bdi_wakeup_thread_delayed() calls from
+ * __mark_inode_dirty().
+ */
+ del_timer_sync(&bdi->wb.wakeup_timer);
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+ fprop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+ unsigned int cap)
+{
+ char tmp[32];
+ int err;
+
+ bdi->name = name;
+ bdi->capabilities = cap;
+ err = bdi_init(bdi);
+ if (err)
+ return err;
+
+ sprintf(tmp, "%.28s%s", name, "-%d");
+ err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+ if (err) {
+ bdi_destroy(bdi);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
+
+static wait_queue_head_t congestion_wqh[2] = {
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+ };
+static atomic_t nr_bdi_congested[2];
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ enum bdi_state bit;
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ bit = sync ? BDI_sync_congested : BDI_async_congested;
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
+ smp_mb__after_clear_bit();
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ enum bdi_state bit;
+
+ bit = sync ? BDI_sync_congested : BDI_async_congested;
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion. If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char kbuf[] = "0\n";
+
+ if (*ppos) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+ return -EFAULT;
+ printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+ table->procname);
+
+ *lenp = 2;
+ *ppos += *lenp;
+ return 2;
+}
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 00000000..07dbc8ec
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
+/*
+ * mm/balloon_compaction.c
+ *
+ * Common interface for making balloon pages movable by compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/balloon_compaction.h>
+
+/*
+ * balloon_devinfo_alloc - allocates a balloon device information descriptor.
+ * @balloon_dev_descriptor: pointer to reference the balloon device which
+ * this struct balloon_dev_info will be servicing.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct balloon_dev_info which will be used to reference a balloon device
+ * as well as to keep track of the balloon device page list.
+ */
+struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
+{
+ struct balloon_dev_info *b_dev_info;
+ b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
+ if (!b_dev_info)
+ return ERR_PTR(-ENOMEM);
+
+ b_dev_info->balloon_device = balloon_dev_descriptor;
+ b_dev_info->mapping = NULL;
+ b_dev_info->isolated_pages = 0;
+ spin_lock_init(&b_dev_info->pages_lock);
+ INIT_LIST_HEAD(&b_dev_info->pages);
+
+ return b_dev_info;
+}
+EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
+
+/*
+ * balloon_page_enqueue - allocates a new page and inserts it into the balloon
+ * page list.
+ * @b_dev_info: balloon device decriptor where we will insert a new page to
+ *
+ * Driver must call it to properly allocate a new enlisted balloon page
+ * before definetively removing it from the guest system.
+ * This function returns the page address for the recently enqueued page or
+ * NULL in the case we fail to allocate a new page this turn.
+ */
+struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
+{
+ unsigned long flags;
+ struct page *page = alloc_page(balloon_mapping_gfp_mask() |
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
+ if (!page)
+ return NULL;
+
+ /*
+ * Block others from accessing the 'page' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'page' at this point.
+ */
+ BUG_ON(!trylock_page(page));
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_enqueue);
+
+/*
+ * balloon_page_dequeue - removes a page from balloon's page list and returns
+ * the its address to allow the driver release the page.
+ * @b_dev_info: balloon device decriptor where we will grab a page from.
+ *
+ * Driver must call it to properly de-allocate a previous enlisted balloon page
+ * before definetively releasing it back to the guest system.
+ * This function returns the page address for the recently dequeued page or
+ * NULL in the case we find balloon's page list temporarily empty due to
+ * compaction isolated pages.
+ */
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ bool dequeued_page;
+
+ dequeued_page = false;
+ list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ /*
+ * Block others from accessing the 'page' while we get around
+ * establishing additional references and preparing the 'page'
+ * to be released by the balloon driver.
+ */
+ if (trylock_page(page)) {
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ /*
+ * Raise the page refcount here to prevent any wrong
+ * attempt to isolate this page, in case of coliding
+ * with balloon_page_isolate() just after we release
+ * the page lock.
+ *
+ * balloon_page_free() will take care of dropping
+ * this extra refcount later.
+ */
+ get_page(page);
+ balloon_page_delete(page);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ dequeued_page = true;
+ break;
+ }
+ }
+
+ if (!dequeued_page) {
+ /*
+ * If we are unable to dequeue a balloon page because the page
+ * list is empty and there is no isolated pages, then something
+ * went out of track and some balloon pages are lost.
+ * BUG() here, otherwise the balloon driver may get stuck into
+ * an infinite loop while attempting to release all its pages.
+ */
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ if (unlikely(list_empty(&b_dev_info->pages) &&
+ !b_dev_info->isolated_pages))
+ BUG();
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ page = NULL;
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_dequeue);
+
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
+ * @b_dev_info: holds the balloon device information descriptor.
+ * @a_ops: balloon_mapping address_space_operations descriptor.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct address_space which will be used as the special page->mapping for
+ * balloon device enlisted page instances.
+ */
+struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
+ const struct address_space_operations *a_ops)
+{
+ struct address_space *mapping;
+
+ mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
+ if (!mapping)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Give a clean 'zeroed' status to all elements of this special
+ * balloon page->mapping struct address_space instance.
+ */
+ address_space_init_once(mapping);
+
+ /*
+ * Set mapping->flags appropriately, to allow balloon pages
+ * ->mapping identification.
+ */
+ mapping_set_balloon(mapping);
+ mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
+
+ /* balloon's page->mapping->a_ops callback descriptor */
+ mapping->a_ops = a_ops;
+
+ /*
+ * Establish a pointer reference back to the balloon device descriptor
+ * this particular page->mapping will be servicing.
+ * This is used by compaction / migration procedures to identify and
+ * access the balloon device pageset while isolating / migrating pages.
+ *
+ * As some balloon drivers can register multiple balloon devices
+ * for a single guest, this also helps compaction / migration to
+ * properly deal with multiple balloon pagesets, when required.
+ */
+ mapping->private_data = b_dev_info;
+ b_dev_info->mapping = mapping;
+
+ return mapping;
+}
+EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
+
+static inline void __isolate_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_del(&page->lru);
+ b_dev_info->isolated_pages++;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline void __putback_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_add(&page->lru, &b_dev_info->pages);
+ b_dev_info->isolated_pages--;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline int __migrate_balloon_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
+}
+
+/* __isolate_lru_page() counterpart for a ballooned page */
+bool balloon_page_isolate(struct page *page)
+{
+ /*
+ * Avoid burning cycles with pages that are yet under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a balloon page being freed under us and
+ * raise its refcount preventing __free_pages() from doing its job
+ * the put_page() at the end of this block will take care of
+ * release this page, thus avoiding a nasty leakage.
+ */
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * As balloon pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the balloon driver releasing a page.
+ *
+ * In order to avoid having an already isolated balloon page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released by
+ * the balloon driver, lets be sure we have the page lock
+ * before proceeding with the balloon page isolation steps.
+ */
+ if (likely(trylock_page(page))) {
+ /*
+ * A ballooned page, by default, has just one refcount.
+ * Prevent concurrent compaction threads from isolating
+ * an already isolated balloon page by refcount check.
+ */
+ if (__is_movable_balloon_page(page) &&
+ page_count(page) == 2) {
+ __isolate_balloon_page(page);
+ unlock_page(page);
+ return true;
+ }
+ unlock_page(page);
+ }
+ put_page(page);
+ }
+ return false;
+}
+
+/* putback_lru_page() counterpart for a ballooned page */
+void balloon_page_putback(struct page *page)
+{
+ /*
+ * 'lock_page()' stabilizes the page and prevents races against
+ * concurrent isolation threads attempting to re-isolate it.
+ */
+ lock_page(page);
+
+ if (__is_movable_balloon_page(page)) {
+ __putback_balloon_page(page);
+ /* drop the extra ref count taken for page isolation */
+ put_page(page);
+ } else {
+ WARN_ON(1);
+ dump_page(page);
+ }
+ unlock_page(page);
+}
+
+/* move_to_new_page() counterpart for a ballooned page */
+int balloon_page_migrate(struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct address_space *mapping;
+ int rc = -EAGAIN;
+
+ /*
+ * Block others from accessing the 'newpage' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'newpage' at this point.
+ */
+ BUG_ON(!trylock_page(newpage));
+
+ if (WARN_ON(!__is_movable_balloon_page(page))) {
+ dump_page(page);
+ unlock_page(newpage);
+ return rc;
+ }
+
+ mapping = page->mapping;
+ if (mapping)
+ rc = __migrate_balloon_page(mapping, newpage, page, mode);
+
+ unlock_page(newpage);
+ return rc;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/bootmem.c b/mm/bootmem.c
new file mode 100644
index 00000000..2b0bcb01
--- /dev/null
+++ b/mm/bootmem.c
@@ -0,0 +1,867 @@
+/*
+ * bootmem - A boot-time physical memory allocator and configurator
+ *
+ * Copyright (C) 1999 Ingo Molnar
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
+ *
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
+ */
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/export.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
+
+#include <asm/bug.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#include "internal.h"
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+ .bdata = &bootmem_node_data[0]
+};
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
+
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
+
+static int bootmem_debug;
+
+static int __init bootmem_debug_setup(char *buf)
+{
+ bootmem_debug = 1;
+ return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
+
+#define bdebug(fmt, args...) ({ \
+ if (unlikely(bootmem_debug)) \
+ printk(KERN_INFO \
+ "bootmem::%s " fmt, \
+ __func__, ## args); \
+})
+
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+ unsigned long bytes = DIV_ROUND_UP(pages, 8);
+
+ return ALIGN(bytes, sizeof(long));
+}
+
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
+ */
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+{
+ unsigned long bytes = bootmap_bytes(pages);
+
+ return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
+}
+
+/*
+ * link bdata in order
+ */
+static void __init link_bootmem(bootmem_data_t *bdata)
+{
+ bootmem_data_t *ent;
+
+ list_for_each_entry(ent, &bdata_list, list) {
+ if (bdata->node_min_pfn < ent->node_min_pfn) {
+ list_add_tail(&bdata->list, &ent->list);
+ return;
+ }
+ }
+
+ list_add_tail(&bdata->list, &bdata_list);
+}
+
+/*
+ * Called once to set up the allocator itself.
+ */
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
+ unsigned long mapstart, unsigned long start, unsigned long end)
+{
+ unsigned long mapsize;
+
+ mminit_validate_memmodel_limits(&start, &end);
+ bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
+ bdata->node_min_pfn = start;
+ bdata->node_low_pfn = end;
+ link_bootmem(bdata);
+
+ /*
+ * Initially all pages are reserved - setup_arch() has to
+ * register free RAM areas explicitly.
+ */
+ mapsize = bootmap_bytes(end - start);
+ memset(bdata->node_bootmem_map, 0xff, mapsize);
+
+ bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
+ bdata - bootmem_node_data, start, mapstart, end, mapsize);
+
+ return mapsize;
+}
+
+/**
+ * init_bootmem_node - register a node as boot memory
+ * @pgdat: node to register
+ * @freepfn: pfn where the bitmap for this node is to be placed
+ * @startpfn: first pfn on the node
+ * @endpfn: first pfn after the node
+ *
+ * Returns the number of bytes needed to hold the bitmap for this node.
+ */
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
+ unsigned long startpfn, unsigned long endpfn)
+{
+ return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
+}
+
+/**
+ * init_bootmem - register boot memory
+ * @start: pfn where the bitmap is to be placed
+ * @pages: number of available physical pages
+ *
+ * Returns the number of bytes needed to hold the bitmap.
+ */
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+{
+ max_low_pfn = pages;
+ min_low_pfn = start;
+ return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+}
+
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting physical address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
+{
+ unsigned long cursor, end;
+
+ kmemleak_free_part(__va(physaddr), size);
+
+ cursor = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
+
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+{
+ struct page *page;
+ unsigned long start, end, pages, count = 0;
+
+ if (!bdata->node_bootmem_map)
+ return 0;
+
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+
+ bdebug("nid=%td start=%lx end=%lx\n",
+ bdata - bootmem_node_data, start, end);
+
+ while (start < end) {
+ unsigned long *map, idx, vec;
+ unsigned shift;
+
+ map = bdata->node_bootmem_map;
+ idx = start - bdata->node_min_pfn;
+ shift = idx & (BITS_PER_LONG - 1);
+ /*
+ * vec holds at most BITS_PER_LONG map bits,
+ * bit 0 corresponds to start.
+ */
+ vec = ~map[idx / BITS_PER_LONG];
+
+ if (shift) {
+ vec >>= shift;
+ if (end - start >= BITS_PER_LONG)
+ vec |= ~map[idx / BITS_PER_LONG + 1] <<
+ (BITS_PER_LONG - shift);
+ }
+ /*
+ * If we have a properly aligned and fully unreserved
+ * BITS_PER_LONG block of pages in front of us, free
+ * it in one go.
+ */
+ if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
+ int order = ilog2(BITS_PER_LONG);
+
+ __free_pages_bootmem(pfn_to_page(start), order);
+ count += BITS_PER_LONG;
+ start += BITS_PER_LONG;
+ } else {
+ unsigned long cur = start;
+
+ start = ALIGN(start + 1, BITS_PER_LONG);
+ while (vec && cur != start) {
+ if (vec & 1) {
+ page = pfn_to_page(cur);
+ __free_pages_bootmem(page, 0);
+ count++;
+ }
+ vec >>= 1;
+ ++cur;
+ }
+ }
+ }
+
+ page = virt_to_page(bdata->node_bootmem_map);
+ pages = bdata->node_low_pfn - bdata->node_min_pfn;
+ pages = bootmem_bootmap_pages(pages);
+ count += pages;
+ while (pages--)
+ __free_pages_bootmem(page++, 0);
+
+ bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
+
+ return count;
+}
+
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here highmem
+ * zone also includes highmem movable zone.
+ */
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ if (!is_highmem(z))
+ z->managed_pages = 0;
+}
+
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+{
+ register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
+ return free_all_bootmem_core(pgdat->bdata);
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+ unsigned long total_pages = 0;
+ bootmem_data_t *bdata;
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
+
+ list_for_each_entry(bdata, &bdata_list, list)
+ total_pages += free_all_bootmem_core(bdata);
+
+ return total_pages;
+}
+
+static void __init __free(bootmem_data_t *bdata,
+ unsigned long sidx, unsigned long eidx)
+{
+ unsigned long idx;
+
+ bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+ sidx + bdata->node_min_pfn,
+ eidx + bdata->node_min_pfn);
+
+ if (bdata->hint_idx > sidx)
+ bdata->hint_idx = sidx;
+
+ for (idx = sidx; idx < eidx; idx++)
+ if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
+ BUG();
+}
+
+static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
+ unsigned long eidx, int flags)
+{
+ unsigned long idx;
+ int exclusive = flags & BOOTMEM_EXCLUSIVE;
+
+ bdebug("nid=%td start=%lx end=%lx flags=%x\n",
+ bdata - bootmem_node_data,
+ sidx + bdata->node_min_pfn,
+ eidx + bdata->node_min_pfn,
+ flags);
+
+ for (idx = sidx; idx < eidx; idx++)
+ if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
+ if (exclusive) {
+ __free(bdata, sidx, idx);
+ return -EBUSY;
+ }
+ bdebug("silent double reserve of PFN %lx\n",
+ idx + bdata->node_min_pfn);
+ }
+ return 0;
+}
+
+static int __init mark_bootmem_node(bootmem_data_t *bdata,
+ unsigned long start, unsigned long end,
+ int reserve, int flags)
+{
+ unsigned long sidx, eidx;
+
+ bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
+ bdata - bootmem_node_data, start, end, reserve, flags);
+
+ BUG_ON(start < bdata->node_min_pfn);
+ BUG_ON(end > bdata->node_low_pfn);
+
+ sidx = start - bdata->node_min_pfn;
+ eidx = end - bdata->node_min_pfn;
+
+ if (reserve)
+ return __reserve(bdata, sidx, eidx, flags);
+ else
+ __free(bdata, sidx, eidx);
+ return 0;
+}
+
+static int __init mark_bootmem(unsigned long start, unsigned long end,
+ int reserve, int flags)
+{
+ unsigned long pos;
+ bootmem_data_t *bdata;
+
+ pos = start;
+ list_for_each_entry(bdata, &bdata_list, list) {
+ int err;
+ unsigned long max;
+
+ if (pos < bdata->node_min_pfn ||
+ pos >= bdata->node_low_pfn) {
+ BUG_ON(pos != start);
+ continue;
+ }
+
+ max = min(bdata->node_low_pfn, end);
+
+ err = mark_bootmem_node(bdata, pos, max, reserve, flags);
+ if (reserve && err) {
+ mark_bootmem(start, pos, 0, 0);
+ return err;
+ }
+
+ if (max == end)
+ return 0;
+ pos = bdata->node_low_pfn;
+ }
+ BUG();
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size)
+{
+ unsigned long start, end;
+
+ kmemleak_free_part(__va(physaddr), size);
+
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
+
+ mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+}
+
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting physical address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long physaddr, unsigned long size)
+{
+ unsigned long start, end;
+
+ kmemleak_free_part(__va(physaddr), size);
+
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
+
+ mark_bootmem(start, end, 0, 0);
+}
+
+/**
+ * reserve_bootmem_node - mark a page range as reserved
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must reside completely on the specified node.
+ */
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size, int flags)
+{
+ unsigned long start, end;
+
+ start = PFN_DOWN(physaddr);
+ end = PFN_UP(physaddr + size);
+
+ return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+}
+
+/**
+ * reserve_bootmem - mark a page range as reserved
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+ int flags)
+{
+ unsigned long start, end;
+
+ start = PFN_DOWN(addr);
+ end = PFN_UP(addr + size);
+
+ return mark_bootmem(start, end, 1, flags);
+}
+
+static unsigned long __init align_idx(struct bootmem_data *bdata,
+ unsigned long idx, unsigned long step)
+{
+ unsigned long base = bdata->node_min_pfn;
+
+ /*
+ * Align the index with respect to the node start so that the
+ * combination of both satisfies the requested alignment.
+ */
+
+ return ALIGN(base + idx, step) - base;
+}
+
+static unsigned long __init align_off(struct bootmem_data *bdata,
+ unsigned long off, unsigned long align)
+{
+ unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+ /* Same as align_idx for byte offsets */
+
+ return ALIGN(base + off, align) - base;
+}
+
+static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ unsigned long fallback = 0;
+ unsigned long min, max, start, sidx, midx, step;
+
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
+
+ BUG_ON(!size);
+ BUG_ON(align & (align - 1));
+ BUG_ON(limit && goal + size > limit);
+
+ if (!bdata->node_bootmem_map)
+ return NULL;
+
+ min = bdata->node_min_pfn;
+ max = bdata->node_low_pfn;
+
+ goal >>= PAGE_SHIFT;
+ limit >>= PAGE_SHIFT;
+
+ if (limit && max > limit)
+ max = limit;
+ if (max <= min)
+ return NULL;
+
+ step = max(align >> PAGE_SHIFT, 1UL);
+
+ if (goal && min < goal && goal < max)
+ start = ALIGN(goal, step);
+ else
+ start = ALIGN(min, step);
+
+ sidx = start - bdata->node_min_pfn;
+ midx = max - bdata->node_min_pfn;
+
+ if (bdata->hint_idx > sidx) {
+ /*
+ * Handle the valid case of sidx being zero and still
+ * catch the fallback below.
+ */
+ fallback = sidx + 1;
+ sidx = align_idx(bdata, bdata->hint_idx, step);
+ }
+
+ while (1) {
+ int merge;
+ void *region;
+ unsigned long eidx, i, start_off, end_off;
+find_block:
+ sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
+ sidx = align_idx(bdata, sidx, step);
+ eidx = sidx + PFN_UP(size);
+
+ if (sidx >= midx || eidx > midx)
+ break;
+
+ for (i = sidx; i < eidx; i++)
+ if (test_bit(i, bdata->node_bootmem_map)) {
+ sidx = align_idx(bdata, i, step);
+ if (sidx == i)
+ sidx += step;
+ goto find_block;
+ }
+
+ if (bdata->last_end_off & (PAGE_SIZE - 1) &&
+ PFN_DOWN(bdata->last_end_off) + 1 == sidx)
+ start_off = align_off(bdata, bdata->last_end_off, align);
+ else
+ start_off = PFN_PHYS(sidx);
+
+ merge = PFN_DOWN(start_off) < sidx;
+ end_off = start_off + size;
+
+ bdata->last_end_off = end_off;
+ bdata->hint_idx = PFN_UP(end_off);
+
+ /*
+ * Reserve the area now:
+ */
+ if (__reserve(bdata, PFN_DOWN(start_off) + merge,
+ PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
+ BUG();
+
+ region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
+ start_off);
+ memset(region, 0, size);
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(region, size, 0, 0);
+ return region;
+ }
+
+ if (fallback) {
+ sidx = align_idx(bdata, fallback - 1, step);
+ fallback = 0;
+ goto find_block;
+ }
+
+ return NULL;
+}
+
+static void * __init alloc_bootmem_core(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ bootmem_data_t *bdata;
+ void *region;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+ list_for_each_entry(bdata, &bdata_list, list) {
+ if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
+ continue;
+ if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
+ break;
+
+ region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+ if (region)
+ return region;
+ }
+
+ return NULL;
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+restart:
+ ptr = alloc_bootmem_core(size, align, goal, limit);
+ if (ptr)
+ return ptr;
+ if (goal) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = 0;
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+
+ if (mem)
+ return mem;
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = 0;
+
+ return ___alloc_bootmem(size, align, goal, limit);
+}
+
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+again:
+
+ /* do not panic in alloc_bootmem_bdata() */
+ if (limit && goal + size > limit)
+ limit = 0;
+
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ ptr = alloc_bootmem_core(size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ if (goal) {
+ goal = 0;
+ goto again;
+ }
+
+ return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ if (ptr)
+ return ptr;
+
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+ unsigned long end_pfn;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ /* update goal according ...MAX_DMA32_PFN */
+ end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+ (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+ void *ptr;
+ unsigned long new_goal;
+
+ new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
+ new_goal, 0);
+ if (ptr)
+ return ptr;
+ }
+#endif
+
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+
+}
+
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
+#endif
+
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
+
+void * __init __alloc_bootmem_low_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem_nopanic(size, align, goal,
+ ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node(pgdat, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+}
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 00000000..5f890176
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,341 @@
+/* bounce buffer handling for block devices
+ *
+ * - Split from highmem.c
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <asm/tlbflush.h>
+
+#include <trace/events/block.h>
+
+#define POOL_SIZE 64
+#define ISA_POOL_SIZE 16
+
+static mempool_t *page_pool, *isa_page_pool;
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
+static __init int init_emergency_pool(void)
+{
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
+ if (max_pfn <= max_low_pfn)
+ return 0;
+#endif
+
+ page_pool = mempool_create_page_pool(POOL_SIZE, 0);
+ BUG_ON(!page_pool);
+ printk("bounce pool size: %d pages\n", POOL_SIZE);
+
+ return 0;
+}
+
+__initcall(init_emergency_pool);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * highmem version, map in to vec
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+ unsigned long flags;
+ unsigned char *vto;
+
+ local_irq_save(flags);
+ vto = kmap_atomic(to->bv_page);
+ memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+ kunmap_atomic(vto);
+ local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+#define bounce_copy_vec(to, vfrom) \
+ memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * allocate pages in the DMA region for the ISA pool
+ */
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
+{
+ return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
+}
+
+/*
+ * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
+ * as the max address, so check if the pool has already been created.
+ */
+int init_emergency_isa_pool(void)
+{
+ if (isa_page_pool)
+ return 0;
+
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+ mempool_free_pages, (void *) 0);
+ BUG_ON(!isa_page_pool);
+
+ printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+ return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+ unsigned char *vfrom;
+ struct bio_vec *tovec, *fromvec;
+ int i;
+
+ __bio_for_each_segment(tovec, to, i, 0) {
+ fromvec = from->bi_io_vec + i;
+
+ /*
+ * not bounced
+ */
+ if (tovec->bv_page == fromvec->bv_page)
+ continue;
+
+ /*
+ * fromvec->bv_offset and fromvec->bv_len might have been
+ * modified by the block layer, so use the original copy,
+ * bounce_copy_vec already uses tovec->bv_len
+ */
+ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+ bounce_copy_vec(tovec, vfrom);
+ flush_dcache_page(tovec->bv_page);
+ }
+}
+
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+ struct bio_vec *bvec, *org_vec;
+ int i;
+
+ if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+ set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
+
+ /*
+ * free up bounce indirect pages used
+ */
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ org_vec = bio_orig->bi_io_vec + i;
+ if (bvec->bv_page == org_vec->bv_page)
+ continue;
+
+ dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+ mempool_free(bvec->bv_page, pool);
+ }
+
+ bio_endio(bio_orig, err);
+ bio_put(bio);
+}
+
+static void bounce_end_io_write(struct bio *bio, int err)
+{
+ bounce_end_io(bio, page_pool, err);
+}
+
+static void bounce_end_io_write_isa(struct bio *bio, int err)
+{
+
+ bounce_end_io(bio, isa_page_pool, err);
+}
+
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ copy_to_high_bio_irq(bio_orig, bio);
+
+ bounce_end_io(bio, pool, err);
+}
+
+static void bounce_end_io_read(struct bio *bio, int err)
+{
+ __bounce_end_io_read(bio, page_pool, err);
+}
+
+static void bounce_end_io_read_isa(struct bio *bio, int err)
+{
+ __bounce_end_io_read(bio, isa_page_pool, err);
+}
+
+#ifdef CONFIG_NEED_BOUNCE_POOL
+static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
+{
+ struct page *page;
+ struct backing_dev_info *bdi;
+ struct address_space *mapping;
+ struct bio_vec *from;
+ int i;
+
+ if (bio_data_dir(bio) != WRITE)
+ return 0;
+
+ if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
+ return 0;
+
+ /*
+ * Based on the first page that has a valid mapping, decide whether or
+ * not we have to employ bounce buffering to guarantee stable pages.
+ */
+ bio_for_each_segment(from, bio, i) {
+ page = from->bv_page;
+ mapping = page_mapping(page);
+ if (!mapping)
+ continue;
+ bdi = mapping->backing_dev_info;
+ return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
+ }
+
+ return 0;
+}
+#else
+static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
+{
+ return 0;
+}
+#endif /* CONFIG_NEED_BOUNCE_POOL */
+
+static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
+ mempool_t *pool, int force)
+{
+ struct page *page;
+ struct bio *bio = NULL;
+ int i, rw = bio_data_dir(*bio_orig);
+ struct bio_vec *to, *from;
+
+ bio_for_each_segment(from, *bio_orig, i) {
+ page = from->bv_page;
+
+ /*
+ * is destination page below bounce pfn?
+ */
+ if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+ continue;
+
+ /*
+ * irk, bounce it
+ */
+ if (!bio) {
+ unsigned int cnt = (*bio_orig)->bi_vcnt;
+
+ bio = bio_alloc(GFP_NOIO, cnt);
+ memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
+ }
+
+
+ to = bio->bi_io_vec + i;
+
+ to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ inc_zone_page_state(to->bv_page, NR_BOUNCE);
+
+ if (rw == WRITE) {
+ char *vto, *vfrom;
+
+ flush_dcache_page(from->bv_page);
+ vto = page_address(to->bv_page) + to->bv_offset;
+ vfrom = kmap(from->bv_page) + from->bv_offset;
+ memcpy(vto, vfrom, to->bv_len);
+ kunmap(from->bv_page);
+ }
+ }
+
+ /*
+ * no pages bounced
+ */
+ if (!bio)
+ return;
+
+ trace_block_bio_bounce(q, *bio_orig);
+
+ /*
+ * at least one page was bounced, fill in possible non-highmem
+ * pages
+ */
+ __bio_for_each_segment(from, *bio_orig, i, 0) {
+ to = bio_iovec_idx(bio, i);
+ if (!to->bv_page) {
+ to->bv_page = from->bv_page;
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ }
+ }
+
+ bio->bi_bdev = (*bio_orig)->bi_bdev;
+ bio->bi_flags |= (1 << BIO_BOUNCED);
+ bio->bi_sector = (*bio_orig)->bi_sector;
+ bio->bi_rw = (*bio_orig)->bi_rw;
+
+ bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+ bio->bi_idx = (*bio_orig)->bi_idx;
+ bio->bi_size = (*bio_orig)->bi_size;
+
+ if (pool == page_pool) {
+ bio->bi_end_io = bounce_end_io_write;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read;
+ } else {
+ bio->bi_end_io = bounce_end_io_write_isa;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read_isa;
+ }
+
+ bio->bi_private = *bio_orig;
+ *bio_orig = bio;
+}
+
+void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+{
+ int must_bounce;
+ mempool_t *pool;
+
+ /*
+ * Data-less bio, nothing to bounce
+ */
+ if (!bio_has_data(*bio_orig))
+ return;
+
+ must_bounce = must_snapshot_stable_pages(q, *bio_orig);
+
+ /*
+ * for non-isa bounce case, just check if the bounce pfn is equal
+ * to or bigger than the highest pfn in the system -- in that case,
+ * don't waste time iterating over bio segments
+ */
+ if (!(q->bounce_gfp & GFP_DMA)) {
+ if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
+ return;
+ pool = page_pool;
+ } else {
+ BUG_ON(!isa_page_pool);
+ pool = isa_page_pool;
+ }
+
+ /*
+ * slow path
+ */
+ __blk_queue_bounce(q, bio_orig, pool, must_bounce);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 00000000..d76ba74b
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,220 @@
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache. See
+ * Documentation/vm/cleancache.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/cleancache.h>
+
+/*
+ * This global enablement flag may be read thousands of times per second
+ * by cleancache_get/put/invalidate even on systems where cleancache_ops
+ * is not claimed (e.g. cleancache is config'ed on but remains
+ * disabled), so is preferred to the slower alternative: a function
+ * call that checks a non-global.
+ */
+int cleancache_enabled __read_mostly;
+EXPORT_SYMBOL(cleancache_enabled);
+
+/*
+ * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static struct cleancache_ops cleancache_ops __read_mostly;
+
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured. These are for information only so are not protected
+ * against increment races.
+ */
+static u64 cleancache_succ_gets;
+static u64 cleancache_failed_gets;
+static u64 cleancache_puts;
+static u64 cleancache_invalidates;
+
+/*
+ * register operations for cleancache, returning previous thus allowing
+ * detection of multiple backends and possible nesting
+ */
+struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
+{
+ struct cleancache_ops old = cleancache_ops;
+
+ cleancache_ops = *ops;
+ cleancache_enabled = 1;
+ return old;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+ sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+ sb->cleancache_poolid =
+ (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+ struct cleancache_filekey *key)
+{
+ int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
+ int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+ struct super_block *sb = inode->i_sb;
+
+ key->u.ino = inode->i_ino;
+ if (sb->s_export_op != NULL) {
+ fhfn = sb->s_export_op->encode_fh;
+ if (fhfn) {
+ len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
+ if (len <= FILEID_ROOT || len == FILEID_INVALID)
+ return -1;
+ if (maxlen > CLEANCACHE_KEY_MAX)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleanache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * The pageframe is unchanged and returns -1 if the get fails.
+ * Page must be locked by caller.
+ */
+int __cleancache_get_page(struct page *page)
+{
+ int ret = -1;
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
+ goto out;
+
+ if (cleancache_get_key(page->mapping->host, &key) < 0)
+ goto out;
+
+ ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
+ if (ret == 0)
+ cleancache_succ_gets++;
+ else
+ cleancache_failed_gets++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's,
+ * inode and page index. Page must be locked. Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ */
+void __cleancache_put_page(struct page *page)
+{
+ int pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id >= 0 &&
+ cleancache_get_key(page->mapping->host, &key) >= 0) {
+ (*cleancache_ops.put_page)(pool_id, key, page->index, page);
+ cleancache_puts++;
+ }
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Invalidate any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ */
+void __cleancache_invalidate_page(struct address_space *mapping,
+ struct page *page)
+{
+ /* careful... page->mapping is NULL sometimes when this is called */
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (pool_id >= 0) {
+ VM_BUG_ON(!PageLocked(page));
+ if (cleancache_get_key(mapping->host, &key) >= 0) {
+ (*cleancache_ops.invalidate_page)(pool_id,
+ key, page->index);
+ cleancache_invalidates++;
+ }
+ }
+}
+EXPORT_SYMBOL(__cleancache_invalidate_page);
+
+/*
+ * Invalidate all data from cleancache associated with the poolid and the
+ * mappings's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ */
+void __cleancache_invalidate_inode(struct address_space *mapping)
+{
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+ (*cleancache_ops.invalidate_inode)(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_invalidate_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be reutrned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs
+ */
+void __cleancache_invalidate_fs(struct super_block *sb)
+{
+ if (sb->cleancache_poolid >= 0) {
+ int old_poolid = sb->cleancache_poolid;
+ sb->cleancache_poolid = -1;
+ (*cleancache_ops.invalidate_fs)(old_poolid);
+ }
+}
+EXPORT_SYMBOL(__cleancache_invalidate_fs);
+
+static int __init init_cleancache(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("cleancache", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets);
+ debugfs_create_u64("failed_gets", S_IRUGO,
+ root, &cleancache_failed_gets);
+ debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &cleancache_invalidates);
+#endif
+ return 0;
+}
+module_init(init_cleancache)
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 00000000..05ccb4cc
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,1209 @@
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include <linux/balloon_compaction.h>
+#include <linux/page-isolation.h>
+#include "internal.h"
+
+#ifdef CONFIG_COMPACTION
+static inline void count_compact_event(enum vm_event_item item)
+{
+ count_vm_event(item);
+}
+
+static inline void count_compact_events(enum vm_event_item item, long delta)
+{
+ count_vm_events(item, delta);
+}
+#else
+#define count_compact_event(item) do { } while (0)
+#define count_compact_events(item, delta) do { } while (0)
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
+static unsigned long release_freepages(struct list_head *freelist)
+{
+ struct page *page, *next;
+ unsigned long count = 0;
+
+ list_for_each_entry_safe(page, next, freelist, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ count++;
+ }
+
+ return count;
+}
+
+static void map_pages(struct list_head *list)
+{
+ struct page *page;
+
+ list_for_each_entry(page, list, lru) {
+ arch_alloc_page(page, 0);
+ kernel_map_pages(page, 1, 1);
+ }
+}
+
+static inline bool migrate_async_suitable(int migratetype)
+{
+ return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
+}
+
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ if (cc->ignore_skip_hint)
+ return true;
+
+ return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long pfn;
+
+ zone->compact_cached_migrate_pfn = start_pfn;
+ zone->compact_cached_free_pfn = end_pfn;
+ zone->compact_blockskip_flush = false;
+
+ /* Walk the zone and mark every pageblock as suitable for isolation */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *page;
+
+ cond_resched();
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ continue;
+
+ clear_pageblock_skip(page);
+ }
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Only flush if a full compaction finished recently */
+ if (zone->compact_blockskip_flush)
+ __reset_isolation_suitable(zone);
+ }
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool migrate_scanner)
+{
+ struct zone *zone = cc->zone;
+ if (!page)
+ return;
+
+ if (!nr_isolated) {
+ unsigned long pfn = page_to_pfn(page);
+ set_pageblock_skip(page);
+
+ /* Update where compaction should restart */
+ if (migrate_scanner) {
+ if (!cc->finished_update_migrate &&
+ pfn > zone->compact_cached_migrate_pfn)
+ zone->compact_cached_migrate_pfn = pfn;
+ } else {
+ if (!cc->finished_update_free &&
+ pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
+ }
+ }
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ return true;
+}
+
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool migrate_scanner)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
+static inline bool should_release_lock(spinlock_t *lock)
+{
+ return need_resched() || spin_is_contended(lock);
+}
+
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out in the event
+ * if contention is severe. For sync compaction, schedule.
+ *
+ * Returns true if the lock is held.
+ * Returns false if the lock is released and compaction should abort
+ */
+static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+ bool locked, struct compact_control *cc)
+{
+ if (should_release_lock(lock)) {
+ if (locked) {
+ spin_unlock_irqrestore(lock, *flags);
+ locked = false;
+ }
+
+ /* async aborts if taking too long or contended */
+ if (!cc->sync) {
+ cc->contended = true;
+ return false;
+ }
+
+ cond_resched();
+ }
+
+ if (!locked)
+ spin_lock_irqsave(lock, *flags);
+ return true;
+}
+
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+ unsigned long *flags, struct compact_control *cc)
+{
+ return compact_checklock_irqsave(lock, flags, false, cc);
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+ int migratetype = get_pageblock_migratetype(page);
+
+ /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+ if (migratetype == MIGRATE_RESERVE)
+ return false;
+
+ if (is_migrate_isolate(migratetype))
+ return false;
+
+ /* If the page is a large free page, then allow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return true;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (migrate_async_suitable(migratetype))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
+/*
+ * Isolate free pages onto a private freelist. Caller must hold zone->lock.
+ * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
+ * pages inside of the pageblock (even though it may still end up isolating
+ * some pages).
+ */
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+ unsigned long blockpfn,
+ unsigned long end_pfn,
+ struct list_head *freelist,
+ bool strict)
+{
+ int nr_scanned = 0, total_isolated = 0;
+ struct page *cursor, *valid_page = NULL;
+ unsigned long nr_strict_required = end_pfn - blockpfn;
+ unsigned long flags;
+ bool locked = false;
+
+ cursor = pfn_to_page(blockpfn);
+
+ /* Isolate free pages. */
+ for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ int isolated, i;
+ struct page *page = cursor;
+
+ nr_scanned++;
+ if (!pfn_valid_within(blockpfn))
+ continue;
+ if (!valid_page)
+ valid_page = page;
+ if (!PageBuddy(page))
+ continue;
+
+ /*
+ * The zone lock must be held to isolate freepages.
+ * Unfortunately this is a very coarse lock and can be
+ * heavily contended if there are parallel allocations
+ * or parallel compactions. For async compaction do not
+ * spin on the lock and we acquire the lock as late as
+ * possible.
+ */
+ locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+ locked, cc);
+ if (!locked)
+ break;
+
+ /* Recheck this is a suitable migration target under lock */
+ if (!strict && !suitable_migration_target(page))
+ break;
+
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
+ continue;
+
+ /* Found a free page, break it into order-0 pages */
+ isolated = split_free_page(page);
+ if (!isolated && strict)
+ break;
+ total_isolated += isolated;
+ for (i = 0; i < isolated; i++) {
+ list_add(&page->lru, freelist);
+ page++;
+ }
+
+ /* If a page was split, advance to the end of it */
+ if (isolated) {
+ blockpfn += isolated - 1;
+ cursor += isolated - 1;
+ }
+ }
+
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+ /*
+ * If strict isolation is requested by CMA then check that all the
+ * pages requested were isolated. If there were any failures, 0 is
+ * returned and CMA will fail.
+ */
+ if (strict && nr_strict_required > total_isolated)
+ total_isolated = 0;
+
+ if (locked)
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (blockpfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, total_isolated, false);
+
+ count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+ count_compact_events(COMPACTISOLATED, total_isolated);
+ return total_isolated;
+}
+
+/**
+ * isolate_freepages_range() - isolate free pages.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Non-free pages, invalid PFNs, or zone boundaries within the
+ * [start_pfn, end_pfn) range are considered errors, cause function to
+ * undo its actions and return zero.
+ *
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater then end_pfn if end fell in a middle of
+ * a free page).
+ */
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long isolated, pfn, block_end_pfn;
+ LIST_HEAD(freelist);
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
+ if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
+ break;
+
+ /*
+ * On subsequent iterations ALIGN() is actually not needed,
+ * but we keep it that we not to complicate the code.
+ */
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
+ &freelist, true);
+
+ /*
+ * In strict mode, isolate_freepages_block() returns 0 if
+ * there are any holes in the block (ie. invalid PFNs or
+ * non-free pages).
+ */
+ if (!isolated)
+ break;
+
+ /*
+ * If we managed to isolate pages, it is always (1 << n) *
+ * pageblock_nr_pages for some non-negative n. (Max order
+ * page may span two pageblocks).
+ */
+ }
+
+ /* split_free_page does not map the pages */
+ map_pages(&freelist);
+
+ if (pfn < end_pfn) {
+ /* Loop terminated early, cleanup. */
+ release_freepages(&freelist);
+ return 0;
+ }
+
+ /* We don't use freelists for anything. */
+ return pfn;
+}
+
+/* Update the number of anon and file isolated pages in the zone */
+static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
+{
+ struct page *page;
+ unsigned int count[2] = { 0, };
+
+ list_for_each_entry(page, &cc->migratepages, lru)
+ count[!!page_is_file_cache(page)]++;
+
+ /* If locked we can use the interrupt unsafe versions */
+ if (locked) {
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ } else {
+ mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ }
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct zone *zone)
+{
+ unsigned long active, inactive, isolated;
+
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+ active = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+ zone_page_state(zone, NR_ISOLATED_ANON);
+
+ return isolated > (inactive + active) / 2;
+}
+
+/**
+ * isolate_migratepages_range() - isolate all migrate-able pages in range.
+ * @zone: Zone pages are in.
+ * @cc: Compaction control structure.
+ * @low_pfn: The first PFN of the range.
+ * @end_pfn: The one-past-the-last PFN of the range.
+ * @unevictable: true if it allows to isolate unevictable pages
+ *
+ * Isolate all pages that can be migrated from the range specified by
+ * [low_pfn, end_pfn). Returns zero if there is a fatal signal
+ * pending), otherwise PFN of the first page that was not scanned
+ * (which may be both less, equal to or more then end_pfn).
+ *
+ * Assumes that cc->migratepages is empty and cc->nr_migratepages is
+ * zero.
+ *
+ * Apart from cc->migratepages and cc->nr_migratetypes this function
+ * does not modify any cc's fields, in particular it does not modify
+ * (or read for that matter) cc->migrate_pfn.
+ */
+unsigned long
+isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
+{
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
+ struct list_head *migratelist = &cc->migratepages;
+ isolate_mode_t mode = 0;
+ struct lruvec *lruvec;
+ unsigned long flags;
+ bool locked = false;
+ struct page *page = NULL, *valid_page = NULL;
+
+ /*
+ * Ensure that there are not too many pages isolated from the LRU
+ * list by either parallel reclaimers or compaction. If there are,
+ * delay for some time until fewer pages are isolated
+ */
+ while (unlikely(too_many_isolated(zone))) {
+ /* async migration should just abort */
+ if (!cc->sync)
+ return 0;
+
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ if (fatal_signal_pending(current))
+ return 0;
+ }
+
+ /* Time to isolate some pages for migration */
+ cond_resched();
+ for (; low_pfn < end_pfn; low_pfn++) {
+ /* give a chance to irqs before checking need_resched() */
+ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (should_release_lock(&zone->lru_lock)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
+ }
+
+ /*
+ * migrate_pfn does not necessarily start aligned to a
+ * pageblock. Ensure that pfn_valid is called when moving
+ * into a new MAX_ORDER_NR_PAGES range in case of large
+ * memory holes within the zone
+ */
+ if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if (!pfn_valid(low_pfn)) {
+ low_pfn += MAX_ORDER_NR_PAGES - 1;
+ continue;
+ }
+ }
+
+ if (!pfn_valid_within(low_pfn))
+ continue;
+ nr_scanned++;
+
+ /*
+ * Get the page and ensure the page is within the same zone.
+ * See the comment in isolate_freepages about overlapping
+ * nodes. It is deliberate that the new zone lock is not taken
+ * as memory compaction should not move pages between nodes.
+ */
+ page = pfn_to_page(low_pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!valid_page)
+ valid_page = page;
+
+ /* If isolation recently failed, do not retry */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
+ /* Skip if free */
+ if (PageBuddy(page))
+ continue;
+
+ /*
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
+ */
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ !migrate_async_suitable(get_pageblock_migratetype(page))) {
+ cc->finished_update_migrate = true;
+ goto next_pageblock;
+ }
+
+ /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU pages and balloon pages
+ * Skip any other type of page
+ */
+ if (!PageLRU(page)) {
+ if (unlikely(balloon_page_movable(page))) {
+ if (locked && balloon_page_isolate(page)) {
+ /* Successfully isolated */
+ cc->finished_update_migrate = true;
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+ goto check_compact_cluster;
+ }
+ }
+ continue;
+ }
+
+ /*
+ * PageLRU is set. lru_lock normally excludes isolation
+ * splitting and collapsing (collapsing has already happened
+ * if PageLRU is set) but the lock is not necessarily taken
+ * here and it is wasteful to take it just to check transhuge.
+ * Check TransHuge without lock and skip the whole pageblock if
+ * it's either a transhuge or hugetlbfs page, as calling
+ * compound_order() without preventing THP from splitting the
+ * page underneath us may return surprising results.
+ */
+ if (PageTransHuge(page)) {
+ if (!locked)
+ goto next_pageblock;
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ /* Check if it is ok to still hold the lock */
+ locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ locked, cc);
+ if (!locked || fatal_signal_pending(current))
+ break;
+
+ /* Recheck PageLRU and PageTransHuge under lock */
+ if (!PageLRU(page))
+ continue;
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ if (!cc->sync)
+ mode |= ISOLATE_ASYNC_MIGRATE;
+
+ if (unevictable)
+ mode |= ISOLATE_UNEVICTABLE;
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ /* Try isolate the page */
+ if (__isolate_lru_page(page, mode) != 0)
+ continue;
+
+ VM_BUG_ON(PageTransCompound(page));
+
+ /* Successfully isolated */
+ cc->finished_update_migrate = true;
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+
+check_compact_cluster:
+ /* Avoid isolating too much */
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ ++low_pfn;
+ break;
+ }
+
+ continue;
+
+next_pageblock:
+ low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
+ }
+
+ acct_isolated(zone, locked, cc);
+
+ if (locked)
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (low_pfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, nr_isolated, true);
+
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
+ count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ if (nr_isolated)
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+
+ return low_pfn;
+}
+
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+#ifdef CONFIG_COMPACTION
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ */
+static void isolate_freepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ struct page *page;
+ unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
+ int nr_freepages = cc->nr_freepages;
+ struct list_head *freelist = &cc->freepages;
+
+ /*
+ * Initialise the free scanner. The starting point is where we last
+ * scanned from (or the end of the zone if starting). The low point
+ * is the end of the pageblock the migration scanner is using.
+ */
+ pfn = cc->free_pfn;
+ low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+
+ /*
+ * Take care that if the migration scanner is at the end of the zone
+ * that the free scanner does not accidentally move to the next zone
+ * in the next isolation cycle.
+ */
+ high_pfn = min(low_pfn, pfn);
+
+ z_end_pfn = zone_end_pfn(zone);
+
+ /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+ for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ pfn -= pageblock_nr_pages) {
+ unsigned long isolated;
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ /*
+ * Check for overlapping nodes/zones. It's possible on some
+ * configurations to have a setup like
+ * node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of
+ * pages do not belong to a single zone.
+ */
+ page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ /* Check the block is suitable for migration */
+ if (!suitable_migration_target(page))
+ continue;
+
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
+
+ /* Found a block suitable for isolating free pages from */
+ isolated = 0;
+
+ /*
+ * As pfn may not start aligned, pfn+pageblock_nr_page
+ * may cross a MAX_ORDER_NR_PAGES boundary and miss
+ * a pfn_valid check. Ensure isolate_freepages_block()
+ * only scans within a pageblock
+ */
+ end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ end_pfn = min(end_pfn, z_end_pfn);
+ isolated = isolate_freepages_block(cc, pfn, end_pfn,
+ freelist, false);
+ nr_freepages += isolated;
+
+ /*
+ * Record the highest PFN we isolated pages from. When next
+ * looking for free pages, the search will restart here as
+ * page migration may have returned some pages to the allocator
+ */
+ if (isolated) {
+ cc->finished_update_free = true;
+ high_pfn = max(high_pfn, pfn);
+ }
+ }
+
+ /* split_free_page does not map the pages */
+ map_pages(freelist);
+
+ cc->free_pfn = high_pfn;
+ cc->nr_freepages = nr_freepages;
+}
+
+/*
+ * This is a migrate-callback that "allocates" freepages by taking pages
+ * from the isolated freelists in the block we are migrating to.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+ unsigned long data,
+ int **result)
+{
+ struct compact_control *cc = (struct compact_control *)data;
+ struct page *freepage;
+
+ /* Isolate free pages if necessary */
+ if (list_empty(&cc->freepages)) {
+ isolate_freepages(cc->zone, cc);
+
+ if (list_empty(&cc->freepages))
+ return NULL;
+ }
+
+ freepage = list_entry(cc->freepages.next, struct page, lru);
+ list_del(&freepage->lru);
+ cc->nr_freepages--;
+
+ return freepage;
+}
+
+/*
+ * We cannot control nr_migratepages and nr_freepages fully when migration is
+ * running as migrate_pages() has no knowledge of compact_control. When
+ * migration is complete, we count the number of pages on the lists by hand.
+ */
+static void update_nr_listpages(struct compact_control *cc)
+{
+ int nr_migratepages = 0;
+ int nr_freepages = 0;
+ struct page *page;
+
+ list_for_each_entry(page, &cc->migratepages, lru)
+ nr_migratepages++;
+ list_for_each_entry(page, &cc->freepages, lru)
+ nr_freepages++;
+
+ cc->nr_migratepages = nr_migratepages;
+ cc->nr_freepages = nr_freepages;
+}
+
+/* possible outcome of isolate_migratepages */
+typedef enum {
+ ISOLATE_ABORT, /* Abort compaction now */
+ ISOLATE_NONE, /* No pages isolated, continue scanning */
+ ISOLATE_SUCCESS, /* Pages isolated, migrate */
+} isolate_migrate_t;
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ */
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ unsigned long low_pfn, end_pfn;
+
+ /* Do not scan outside zone boundaries */
+ low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+
+ /* Only scan within a pageblock boundary */
+ end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+
+ /* Do not cross the free scanner or scan within a memory hole */
+ if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
+ cc->migrate_pfn = end_pfn;
+ return ISOLATE_NONE;
+ }
+
+ /* Perform the isolation */
+ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
+ if (!low_pfn || cc->contended)
+ return ISOLATE_ABORT;
+
+ cc->migrate_pfn = low_pfn;
+
+ return ISOLATE_SUCCESS;
+}
+
+static int compact_finished(struct zone *zone,
+ struct compact_control *cc)
+{
+ unsigned int order;
+ unsigned long watermark;
+
+ if (fatal_signal_pending(current))
+ return COMPACT_PARTIAL;
+
+ /* Compaction run completes if the migrate and free scanner meet */
+ if (cc->free_pfn <= cc->migrate_pfn) {
+ /*
+ * Mark that the PG_migrate_skip information should be cleared
+ * by kswapd when it goes to sleep. kswapd does not set the
+ * flag itself as the decision to be clear should be directly
+ * based on an allocation request.
+ */
+ if (!current_is_kswapd())
+ zone->compact_blockskip_flush = true;
+
+ return COMPACT_COMPLETE;
+ }
+
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (cc->order == -1)
+ return COMPACT_CONTINUE;
+
+ /* Compaction run is not finished if the watermark is not met */
+ watermark = low_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
+ if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+ return COMPACT_CONTINUE;
+
+ /* Direct compactor: Is a suitable page free? */
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+
+ /* Job done if page is free of the right migratetype */
+ if (!list_empty(&area->free_list[cc->migratetype]))
+ return COMPACT_PARTIAL;
+
+ /* Job done if allocation would set block type */
+ if (cc->order >= pageblock_order && area->nr_free)
+ return COMPACT_PARTIAL;
+ }
+
+ return COMPACT_CONTINUE;
+}
+
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+ int fragindex;
+ unsigned long watermark;
+
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (order == -1)
+ return COMPACT_CONTINUE;
+
+ /*
+ * Watermarks for order-0 must be met for compaction. Note the 2UL.
+ * This is because during migration, copies of pages need to be
+ * allocated and for a short time, the footprint is higher
+ */
+ watermark = low_wmark_pages(zone) + (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return COMPACT_SKIPPED;
+
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1000 implies allocations might succeed depending on
+ * watermarks
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation.
+ */
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ return COMPACT_SKIPPED;
+
+ if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+ 0, 0))
+ return COMPACT_PARTIAL;
+
+ return COMPACT_CONTINUE;
+}
+
+static int compact_zone(struct zone *zone, struct compact_control *cc)
+{
+ int ret;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /* Compaction is likely to fail */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
+ /*
+ * Setup to move all movable pages to the end of the zone. Used cached
+ * information on where the scanners should start but check that it
+ * is initialised by ensuring the values are within zone boundaries.
+ */
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+ cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn = cc->migrate_pfn;
+ }
+
+ /*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
+ migrate_prep_local();
+
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ unsigned long nr_migrate, nr_remaining;
+ int err;
+
+ switch (isolate_migratepages(zone, cc)) {
+ case ISOLATE_ABORT:
+ ret = COMPACT_PARTIAL;
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ goto out;
+ case ISOLATE_NONE:
+ continue;
+ case ISOLATE_SUCCESS:
+ ;
+ }
+
+ nr_migrate = cc->nr_migratepages;
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
+ (unsigned long)cc,
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ MR_COMPACTION);
+ update_nr_listpages(cc);
+ nr_remaining = cc->nr_migratepages;
+
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
+
+ /* Release isolated pages not migrated */
+ if (err) {
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
+ }
+ }
+
+out:
+ /* Release free pages and check accounting */
+ cc->nr_freepages -= release_freepages(&cc->freepages);
+ VM_BUG_ON(cc->nr_freepages != 0);
+
+ return ret;
+}
+
+static unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask,
+ bool sync, bool *contended)
+{
+ unsigned long ret;
+ struct compact_control cc = {
+ .nr_freepages = 0,
+ .nr_migratepages = 0,
+ .order = order,
+ .migratetype = allocflags_to_migratetype(gfp_mask),
+ .zone = zone,
+ .sync = sync,
+ };
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ ret = compact_zone(zone, &cc);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+
+ *contended = cc.contended;
+ return ret;
+}
+
+int sysctl_extfrag_threshold = 500;
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @zonelist: The zonelist used for the current allocation
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
+ *
+ * This is the main entry point for direct page compaction.
+ */
+unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ bool sync, bool *contended)
+{
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ int may_enter_fs = gfp_mask & __GFP_FS;
+ int may_perform_io = gfp_mask & __GFP_IO;
+ struct zoneref *z;
+ struct zone *zone;
+ int rc = COMPACT_SKIPPED;
+ int alloc_flags = 0;
+
+ /* Check if the GFP flags allow compaction */
+ if (!order || !may_enter_fs || !may_perform_io)
+ return rc;
+
+ count_compact_event(COMPACTSTALL);
+
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
+ /* Compact each zone in the list */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+ nodemask) {
+ int status;
+
+ status = compact_zone_order(zone, order, gfp_mask, sync,
+ contended);
+ rc = max(status, rc);
+
+ /* If a normal allocation would succeed, stop compacting */
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+ alloc_flags))
+ break;
+ }
+
+ return rc;
+}
+
+
+/* Compact all zones within a node */
+static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+{
+ int zoneid;
+ struct zone *zone;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc->nr_freepages = 0;
+ cc->nr_migratepages = 0;
+ cc->zone = zone;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
+ if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+ compact_zone(zone, cc);
+
+ if (cc->order > 0) {
+ int ok = zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0);
+ if (ok && cc->order >= zone->compact_order_failed)
+ zone->compact_order_failed = cc->order + 1;
+ /* Currently async compaction is never deferred. */
+ else if (!ok && cc->sync)
+ defer_compaction(zone, cc->order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc->freepages));
+ VM_BUG_ON(!list_empty(&cc->migratepages));
+ }
+}
+
+void compact_pgdat(pg_data_t *pgdat, int order)
+{
+ struct compact_control cc = {
+ .order = order,
+ .sync = false,
+ };
+
+ __compact_pgdat(pgdat, &cc);
+}
+
+static void compact_node(int nid)
+{
+ struct compact_control cc = {
+ .order = -1,
+ .sync = true,
+ };
+
+ __compact_pgdat(NODE_DATA(nid), &cc);
+}
+
+/* Compact all nodes in the system */
+static void compact_nodes(void)
+{
+ int nid;
+
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
+ for_each_online_node(nid)
+ compact_node(nid);
+}
+
+/* The written value is actually unused, all memory is compacted */
+int sysctl_compact_memory;
+
+/* This is the entry point for compacting all nodes via /proc/sys/vm */
+int sysctl_compaction_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ if (write)
+ compact_nodes();
+
+ return 0;
+}
+
+int sysctl_extfrag_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ return 0;
+}
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+ssize_t sysfs_compact_node(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int nid = dev->id;
+
+ if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
+ /* Flush pending updates to the LRU lists */
+ lru_add_drain_all();
+
+ compact_node(nid);
+ }
+
+ return count;
+}
+static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+
+int compaction_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_compact);
+}
+
+void compaction_unregister_node(struct node *node)
+{
+ return device_remove_file(&node->dev, &dev_attr_compact);
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
+#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 00000000..789ff70c
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,102 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/page-debug-flags.h>
+#include <linux/poison.h>
+#include <linux/ratelimit.h>
+
+static inline void set_page_poison(struct page *page)
+{
+ __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline void clear_page_poison(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline bool page_poison(struct page *page)
+{
+ return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static void poison_page(struct page *page)
+{
+ void *addr = kmap_atomic(page);
+
+ set_page_poison(page);
+ memset(addr, PAGE_POISON, PAGE_SIZE);
+ kunmap_atomic(addr);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+ unsigned char error = a ^ b;
+
+ return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
+ unsigned char *start;
+ unsigned char *end;
+
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
+ return;
+
+ for (end = mem + bytes - 1; end > start; end--) {
+ if (*end != PAGE_POISON)
+ break;
+ }
+
+ if (!__ratelimit(&ratelimit))
+ return;
+ else if (start == end && single_bit_flip(*start, PAGE_POISON))
+ printk(KERN_ERR "pagealloc: single bit error\n");
+ else
+ printk(KERN_ERR "pagealloc: memory corruption\n");
+
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+ end - start + 1, 1);
+ dump_stack();
+}
+
+static void unpoison_page(struct page *page)
+{
+ void *addr;
+
+ if (!page_poison(page))
+ return;
+
+ addr = kmap_atomic(page);
+ check_poison_mem(addr, PAGE_SIZE);
+ clear_page_poison(page);
+ kunmap_atomic(addr);
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ unpoison_page(page + i);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (enable)
+ unpoison_pages(page, numpages);
+ else
+ poison_pages(page, numpages);
+}
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 00000000..c69781e9
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,514 @@
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 as published by the
+ * Free Software Foundation.
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device. It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple. The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages. Each page in the page_list is split into blocks of at
+ * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page. Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+#define DMAPOOL_DEBUG 1
+#endif
+
+struct dma_pool { /* the pool */
+ struct list_head page_list;
+ spinlock_t lock;
+ size_t size;
+ struct device *dev;
+ size_t allocation;
+ size_t boundary;
+ char name[32];
+ struct list_head pools;
+};
+
+struct dma_page { /* cacheable header for 'allocation' bytes */
+ struct list_head page_list;
+ void *vaddr;
+ dma_addr_t dma;
+ unsigned int in_use;
+ unsigned int offset;
+};
+
+static DEFINE_MUTEX(pools_lock);
+
+static ssize_t
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ unsigned temp;
+ unsigned size;
+ char *next;
+ struct dma_page *page;
+ struct dma_pool *pool;
+
+ next = buf;
+ size = PAGE_SIZE;
+
+ temp = scnprintf(next, size, "poolinfo - 0.1\n");
+ size -= temp;
+ next += temp;
+
+ mutex_lock(&pools_lock);
+ list_for_each_entry(pool, &dev->dma_pools, pools) {
+ unsigned pages = 0;
+ unsigned blocks = 0;
+
+ spin_lock_irq(&pool->lock);
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ pages++;
+ blocks += page->in_use;
+ }
+ spin_unlock_irq(&pool->lock);
+
+ /* per-pool info, no real statistics yet */
+ temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
+ pool->name, blocks,
+ pages * (pool->allocation / pool->size),
+ pool->size, pages);
+ size -= temp;
+ next += temp;
+ }
+ mutex_unlock(&pools_lock);
+
+ return PAGE_SIZE - size;
+}
+
+static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @boundary: returned blocks won't cross this power of two boundary
+ * Context: !in_interrupt()
+ *
+ * Returns a dma allocation pool with the requested characteristics, or
+ * null if one can't be created. Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory. Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives. The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary. This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ */
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary)
+{
+ struct dma_pool *retval;
+ size_t allocation;
+
+ if (align == 0) {
+ align = 1;
+ } else if (align & (align - 1)) {
+ return NULL;
+ }
+
+ if (size == 0) {
+ return NULL;
+ } else if (size < 4) {
+ size = 4;
+ }
+
+ if ((size % align) != 0)
+ size = ALIGN(size, align);
+
+ allocation = max_t(size_t, size, PAGE_SIZE);
+
+ if (!boundary) {
+ boundary = allocation;
+ } else if ((boundary < size) || (boundary & (boundary - 1))) {
+ return NULL;
+ }
+
+ retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+ if (!retval)
+ return retval;
+
+ strlcpy(retval->name, name, sizeof(retval->name));
+
+ retval->dev = dev;
+
+ INIT_LIST_HEAD(&retval->page_list);
+ spin_lock_init(&retval->lock);
+ retval->size = size;
+ retval->boundary = boundary;
+ retval->allocation = allocation;
+
+ if (dev) {
+ int ret;
+
+ mutex_lock(&pools_lock);
+ if (list_empty(&dev->dma_pools))
+ ret = device_create_file(dev, &dev_attr_pools);
+ else
+ ret = 0;
+ /* note: not currently insisting "name" be unique */
+ if (!ret)
+ list_add(&retval->pools, &dev->dma_pools);
+ else {
+ kfree(retval);
+ retval = NULL;
+ }
+ mutex_unlock(&pools_lock);
+ } else
+ INIT_LIST_HEAD(&retval->pools);
+
+ return retval;
+}
+EXPORT_SYMBOL(dma_pool_create);
+
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+ unsigned int offset = 0;
+ unsigned int next_boundary = pool->boundary;
+
+ do {
+ unsigned int next = offset + pool->size;
+ if (unlikely((next + pool->size) >= next_boundary)) {
+ next = next_boundary;
+ next_boundary += pool->boundary;
+ }
+ *(int *)(page->vaddr + offset) = next;
+ offset = next;
+ } while (offset < pool->allocation);
+}
+
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
+{
+ struct dma_page *page;
+
+ page = kmalloc(sizeof(*page), mem_flags);
+ if (!page)
+ return NULL;
+ page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
+ &page->dma, mem_flags);
+ if (page->vaddr) {
+#ifdef DMAPOOL_DEBUG
+ memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+ pool_initialise_page(pool, page);
+ page->in_use = 0;
+ page->offset = 0;
+ } else {
+ kfree(page);
+ page = NULL;
+ }
+ return page;
+}
+
+static inline int is_page_busy(struct dma_page *page)
+{
+ return page->in_use != 0;
+}
+
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+{
+ dma_addr_t dma = page->dma;
+
+#ifdef DMAPOOL_DEBUG
+ memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+ dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
+ list_del(&page->page_list);
+ kfree(page);
+}
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ */
+void dma_pool_destroy(struct dma_pool *pool)
+{
+ mutex_lock(&pools_lock);
+ list_del(&pool->pools);
+ if (pool->dev && list_empty(&pool->dev->dma_pools))
+ device_remove_file(pool->dev, &dev_attr_pools);
+ mutex_unlock(&pools_lock);
+
+ while (!list_empty(&pool->page_list)) {
+ struct dma_page *page;
+ page = list_entry(pool->page_list.next,
+ struct dma_page, page_list);
+ if (is_page_busy(page)) {
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_destroy %s, %p busy\n",
+ pool->name, page->vaddr);
+ else
+ printk(KERN_ERR
+ "dma_pool_destroy %s, %p busy\n",
+ pool->name, page->vaddr);
+ /* leak the still-in-use consistent memory */
+ list_del(&page->page_list);
+ kfree(page);
+ } else
+ pool_free_page(pool, page);
+ }
+
+ kfree(pool);
+}
+EXPORT_SYMBOL(dma_pool_destroy);
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * This returns the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, %NULL is returned.
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+{
+ unsigned long flags;
+ struct dma_page *page;
+ size_t offset;
+ void *retval;
+
+ might_sleep_if(mem_flags & __GFP_WAIT);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (page->offset < pool->allocation)
+ goto ready;
+ }
+
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ page = pool_alloc_page(pool, mem_flags);
+ if (!page)
+ return NULL;
+
+ spin_lock_irqsave(&pool->lock, flags);
+
+ list_add(&page->page_list, &pool->page_list);
+ ready:
+ page->in_use++;
+ offset = page->offset;
+ page->offset = *(int *)(page->vaddr + offset);
+ retval = offset + page->vaddr;
+ *handle = offset + page->dma;
+#ifdef DMAPOOL_DEBUG
+ {
+ int i;
+ u8 *data = retval;
+ /* page->offset is stored in first 4 bytes */
+ for (i = sizeof(page->offset); i < pool->size; i++) {
+ if (data[i] == POOL_POISON_FREED)
+ continue;
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_alloc %s, %p (corruped)\n",
+ pool->name, retval);
+ else
+ pr_err("dma_pool_alloc %s, %p (corruped)\n",
+ pool->name, retval);
+
+ /*
+ * Dump the first 4 bytes even if they are not
+ * POOL_POISON_FREED
+ */
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+ data, pool->size, 1);
+ break;
+ }
+ }
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+ struct dma_page *page;
+
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (dma < page->dma)
+ continue;
+ if (dma < (page->dma + pool->allocation))
+ return page;
+ }
+ return NULL;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ */
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+ struct dma_page *page;
+ unsigned long flags;
+ unsigned int offset;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ page = pool_find_page(pool, dma);
+ if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pool->name, vaddr, (unsigned long)dma);
+ else
+ printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pool->name, vaddr, (unsigned long)dma);
+ return;
+ }
+
+ offset = vaddr - page->vaddr;
+#ifdef DMAPOOL_DEBUG
+ if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pool->name, vaddr, (unsigned long long)dma);
+ else
+ printk(KERN_ERR
+ "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pool->name, vaddr, (unsigned long long)dma);
+ return;
+ }
+ {
+ unsigned int chain = page->offset;
+ while (chain < pool->allocation) {
+ if (chain != offset) {
+ chain = *(int *)(page->vaddr + chain);
+ continue;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
+ "already free\n", pool->name,
+ (unsigned long long)dma);
+ else
+ printk(KERN_ERR "dma_pool_free %s, dma %Lx "
+ "already free\n", pool->name,
+ (unsigned long long)dma);
+ return;
+ }
+ }
+ memset(vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+ page->in_use--;
+ *(int *)vaddr = page->offset;
+ page->offset = offset;
+ /*
+ * Resist a temptation to do
+ * if (!is_page_busy(page)) pool_free_page(pool, page);
+ * Better have a few empty pages hang around.
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+EXPORT_SYMBOL(dma_pool_free);
+
+/*
+ * Managed DMA pool
+ */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+ struct dma_pool *pool = *(struct dma_pool **)res;
+
+ dma_pool_destroy(pool);
+}
+
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+ return *(struct dma_pool **)res == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ *
+ * Managed dma_pool_create(). DMA pool created with this function is
+ * automatically destroyed on driver detach.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+ size_t size, size_t align, size_t allocation)
+{
+ struct dma_pool **ptr, *pool;
+
+ ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return NULL;
+
+ pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
+ if (pool)
+ devres_add(dev, ptr);
+ else
+ devres_free(ptr);
+
+ return pool;
+}
+EXPORT_SYMBOL(dmam_pool_create);
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+ struct device *dev = pool->dev;
+
+ WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+ dma_pool_destroy(pool);
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 00000000..7e092689
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,170 @@
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003 Andrew Morton
+ * Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/fadvise.h>
+#include <linux/writeback.h>
+#include <linux/syscalls.h>
+#include <linux/swap.h>
+
+#include <asm/unistd.h>
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ */
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
+{
+ struct fd f = fdget(fd);
+ struct address_space *mapping;
+ struct backing_dev_info *bdi;
+ loff_t endbyte; /* inclusive */
+ pgoff_t start_index;
+ pgoff_t end_index;
+ unsigned long nrpages;
+ int ret = 0;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+ ret = -ESPIPE;
+ goto out;
+ }
+
+ mapping = f.file->f_mapping;
+ if (!mapping || len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (mapping->a_ops->get_xip_mem) {
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_NOREUSE:
+ case POSIX_FADV_DONTNEED:
+ /* no bad return value, but ignore advice */
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ goto out;
+ }
+
+ /* Careful about overflows. Len == 0 means "as much as possible" */
+ endbyte = offset + len;
+ if (!len || endbyte < len)
+ endbyte = -1;
+ else
+ endbyte--; /* inclusive */
+
+ bdi = mapping->backing_dev_info;
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ f.file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
+ break;
+ case POSIX_FADV_RANDOM:
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
+ break;
+ case POSIX_FADV_SEQUENTIAL:
+ f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
+ break;
+ case POSIX_FADV_WILLNEED:
+ /* First and last PARTIAL page! */
+ start_index = offset >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
+
+ /* Careful about overflow on the "+1" */
+ nrpages = end_index - start_index + 1;
+ if (!nrpages)
+ nrpages = ~0UL;
+
+ /*
+ * Ignore return value because fadvise() shall return
+ * success even if filesystem can't retrieve a hint,
+ */
+ force_page_cache_readahead(mapping, f.file, start_index,
+ nrpages);
+ break;
+ case POSIX_FADV_NOREUSE:
+ break;
+ case POSIX_FADV_DONTNEED:
+ if (!bdi_write_congested(mapping->backing_dev_info))
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
+
+ /* First and last FULL page! */
+ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+ end_index = (endbyte >> PAGE_CACHE_SHIFT);
+
+ if (end_index >= start_index) {
+ unsigned long count = invalidate_mapping_pages(mapping,
+ start_index, end_index);
+
+ /*
+ * If fewer pages were invalidated than expected then
+ * it is possible that some of the pages were on
+ * a per-cpu pagevec for a remote CPU. Drain all
+ * pagevecs and try again.
+ */
+ if (count < (end_index - start_index + 1)) {
+ lru_add_drain_all();
+ invalidate_mapping_pages(mapping, start_index,
+ end_index);
+ }
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out:
+ fdput(f);
+ return ret;
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+ return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
+{
+ return sys_fadvise64_64(fd, offset, len, advice);
+}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+ return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif
+
+#endif
diff --git a/mm/failslab.c b/mm/failslab.c
new file mode 100644
index 00000000..fefaabaa
--- /dev/null
+++ b/mm/failslab.c
@@ -0,0 +1,60 @@
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+
+static struct {
+ struct fault_attr attr;
+ u32 ignore_gfp_wait;
+ int cache_filter;
+} failslab = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .cache_filter = 0,
+};
+
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+{
+ if (gfpflags & __GFP_NOFAIL)
+ return false;
+
+ if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+ return false;
+
+ if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ return false;
+
+ return should_fail(&failslab.attr, size);
+}
+
+static int __init setup_failslab(char *str)
+{
+ return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+static int __init failslab_debugfs_init(void)
+{
+ struct dentry *dir;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+
+ dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter))
+ goto fail;
+
+ return 0;
+fail:
+ debugfs_remove_recursive(dir);
+
+ return -ENOMEM;
+}
+
+late_initcall(failslab_debugfs_init);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/filemap.c b/mm/filemap.c
new file mode 100644
index 00000000..e1979fdc
--- /dev/null
+++ b/mm/filemap.c
@@ -0,0 +1,2578 @@
+/*
+ * linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include <linux/export.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/aio.h>
+#include <linux/capability.h>
+#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/hash.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/cpuset.h>
+#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
+#include <linux/cleancache.h>
+#include "internal.h"
+
+/*
+ * FIXME: remove all knowledge of the buffer layer from the core VM
+ */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
+
+#include <asm/mman.h>
+
+/*
+ * Shared mappings implemented 30.11.1994. It's not fully working yet,
+ * though.
+ *
+ * Shared mappings now work. 15.8.1995 Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
+ */
+
+/*
+ * Lock ordering:
+ *
+ * ->i_mmap_mutex (truncate_pagecache)
+ * ->private_lock (__free_pte->__set_page_dirty_buffers)
+ * ->swap_lock (exclusive_swap_page, others)
+ * ->mapping->tree_lock
+ *
+ * ->i_mutex
+ * ->i_mmap_mutex (truncate->unmap_mapping_range)
+ *
+ * ->mmap_sem
+ * ->i_mmap_mutex
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
+ * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
+ *
+ * ->mmap_sem
+ * ->lock_page (access_process_vm)
+ *
+ * ->i_mutex (generic_file_buffered_write)
+ * ->mmap_sem (fault_in_pages_readable->do_page_fault)
+ *
+ * bdi->wb.list_lock
+ * sb_lock (fs/fs-writeback.c)
+ * ->mapping->tree_lock (__sync_single_inode)
+ *
+ * ->i_mmap_mutex
+ * ->anon_vma.lock (vma_adjust)
+ *
+ * ->anon_vma.lock
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
+ *
+ * ->page_table_lock or pte_lock
+ * ->swap_lock (try_to_unmap_one)
+ * ->private_lock (try_to_unmap_one)
+ * ->tree_lock (try_to_unmap_one)
+ * ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
+ * ->private_lock (page_remove_rmap->set_page_dirty)
+ * ->tree_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
+ * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
+ *
+ * ->i_mmap_mutex
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
+ */
+
+/*
+ * Delete a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe. The caller must hold the mapping's tree_lock.
+ */
+void __delete_from_page_cache(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ /*
+ * if we're uptodate, flush out into the cleancache, otherwise
+ * invalidate any existing cleancache entries. We can't leave
+ * stale data around in the cleancache once our page is gone
+ */
+ if (PageUptodate(page) && PageMappedToDisk(page))
+ cleancache_put_page(page);
+ else
+ cleancache_invalidate_page(mapping, page);
+
+ radix_tree_delete(&mapping->page_tree, page->index);
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
+ mapping->nrpages--;
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ if (PageSwapBacked(page))
+ __dec_zone_page_state(page, NR_SHMEM);
+ BUG_ON(page_mapped(page));
+
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ *
+ * Fix it up by doing a final dirty accounting check after
+ * having removed the page entirely.
+ */
+ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ }
+}
+
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
+
+ BUG_ON(!PageLocked(page));
+
+ freepage = mapping->a_ops->freepage;
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
+ page_cache_release(page);
+}
+EXPORT_SYMBOL(delete_from_page_cache);
+
+static int sleep_on_page(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+static int sleep_on_page_killable(void *word)
+{
+ sleep_on_page(word);
+ return fatal_signal_pending(current) ? -EINTR : 0;
+}
+
+/**
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends (inclusive)
+ * @sync_mode: enable synchronous operation
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback. The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ */
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
+{
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+
+ if (!mapping_cap_writeback_dirty(mapping))
+ return 0;
+
+ ret = do_writepages(mapping, &wbc);
+ return ret;
+}
+
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+ int sync_mode)
+{
+ return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+}
+
+int filemap_fdatawrite(struct address_space *mapping)
+{
+ return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite);
+
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
+
+/**
+ * filemap_flush - mostly a non-blocking flush
+ * @mapping: target address_space
+ *
+ * This is a mostly non-blocking flush. Not suitable for data-integrity
+ * purposes - I/O may not be started against all dirty pages.
+ */
+int filemap_flush(struct address_space *mapping)
+{
+ return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL(filemap_flush);
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+ loff_t end_byte)
+{
+ pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
+ struct pagevec pvec;
+ int nr_pages;
+ int ret = 0;
+
+ if (end_byte < start_byte)
+ return 0;
+
+ pagevec_init(&pvec, 0);
+ while ((index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_WRITEBACK,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /* until radix tree lookup accepts end_index */
+ if (page->index > end)
+ continue;
+
+ wait_on_page_writeback(page);
+ if (TestClearPageError(page))
+ ret = -EIO;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ /* Check for outstanding write errors */
+ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+ if (test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+
+ return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawait_range);
+
+/**
+ * filemap_fdatawait - wait for all under-writeback pages to complete
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.
+ */
+int filemap_fdatawait(struct address_space *mapping)
+{
+ loff_t i_size = i_size_read(mapping->host);
+
+ if (i_size == 0)
+ return 0;
+
+ return filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+EXPORT_SYMBOL(filemap_fdatawait);
+
+int filemap_write_and_wait(struct address_space *mapping)
+{
+ int err = 0;
+
+ if (mapping->nrpages) {
+ err = filemap_fdatawrite(mapping);
+ /*
+ * Even if the above returned error, the pages may be
+ * written partially (e.g. -ENOSPC), so we wait for it.
+ * But the -EIO is special case, it may indicate the worst
+ * thing (e.g. bug) happened, so we avoid waiting for it.
+ */
+ if (err != -EIO) {
+ int err2 = filemap_fdatawait(mapping);
+ if (!err)
+ err = err2;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(filemap_write_and_wait);
+
+/**
+ * filemap_write_and_wait_range - write out & wait on a file range
+ * @mapping: the address_space for the pages
+ * @lstart: offset in bytes where the range starts
+ * @lend: offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ */
+int filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int err = 0;
+
+ if (mapping->nrpages) {
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO) {
+ int err2 = filemap_fdatawait_range(mapping,
+ lstart, lend);
+ if (!err)
+ err = err2;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(filemap_write_and_wait_range);
+
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ int error;
+
+ VM_BUG_ON(!PageLocked(old));
+ VM_BUG_ON(!PageLocked(new));
+ VM_BUG_ON(new->mapping);
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (!error) {
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *);
+
+ pgoff_t offset = old->index;
+ freepage = mapping->a_ops->freepage;
+
+ page_cache_get(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(old);
+ error = radix_tree_insert(&mapping->page_tree, offset, new);
+ BUG_ON(error);
+ mapping->nrpages++;
+ __inc_zone_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(new))
+ __inc_zone_page_state(new, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ /* mem_cgroup codes must not be called under tree_lock */
+ mem_cgroup_replace_page_cache(old, new);
+ radix_tree_preload_end();
+ if (freepage)
+ freepage(old);
+ page_cache_release(old);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page: page to add
+ * @mapping: the page's address_space
+ * @offset: page index
+ * @gfp_mask: page allocation mode
+ *
+ * This function is used to add a page to the pagecache. It must be locked.
+ * This function does not add the page to the LRU. The caller must do that.
+ */
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ int error;
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
+
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp_mask & GFP_RECLAIM_MASK);
+ if (error)
+ goto out;
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (error == 0) {
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ error = radix_tree_insert(&mapping->page_tree, offset, page);
+ if (likely(!error)) {
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+ page_cache_release(page);
+ }
+ radix_tree_preload_end();
+ } else
+ mem_cgroup_uncharge_cache_page(page);
+out:
+ return error;
+}
+EXPORT_SYMBOL(add_to_page_cache_locked);
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ int ret;
+
+ ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+ if (ret == 0)
+ lru_cache_add_file(page);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+
+#ifdef CONFIG_NUMA
+struct page *__page_cache_alloc(gfp_t gfp)
+{
+ int n;
+ struct page *page;
+
+ if (cpuset_do_page_mem_spread()) {
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
+ return page;
+ }
+ return alloc_pages(gfp, 0);
+}
+EXPORT_SYMBOL(__page_cache_alloc);
+#endif
+
+/*
+ * In order to wait for pages to become available there must be
+ * waitqueues associated with pages. By using a hash table of
+ * waitqueues where the bucket discipline is to maintain all
+ * waiters on the same queue and wake all when any of the pages
+ * become available, and for the woken contexts to check to be
+ * sure the appropriate page became available, this saves space
+ * at a cost of "thundering herd" phenomena during rare hash
+ * collisions.
+ */
+static wait_queue_head_t *page_waitqueue(struct page *page)
+{
+ const struct zone *zone = page_zone(page);
+
+ return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+}
+
+static inline void wake_up_page(struct page *page, int bit)
+{
+ __wake_up_bit(page_waitqueue(page), &page->flags, bit);
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ if (test_bit(bit_nr, &page->flags))
+ __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
+ TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ if (!test_bit(bit_nr, &page->flags))
+ return 0;
+
+ return __wait_on_bit(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+
+/**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, waiter);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
+/**
+ * unlock_page - unlock a locked page
+ * @page: the page
+ *
+ * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Also wakes sleepers in wait_on_page_writeback() because the wakeup
+ * mechananism between PageLocked pages and PageWriteback pages is shared.
+ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ *
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+ */
+void unlock_page(struct page *page)
+{
+ VM_BUG_ON(!PageLocked(page));
+ clear_bit_unlock(PG_locked, &page->flags);
+ smp_mb__after_clear_bit();
+ wake_up_page(page, PG_locked);
+}
+EXPORT_SYMBOL(unlock_page);
+
+/**
+ * end_page_writeback - end writeback against a page
+ * @page: the page
+ */
+void end_page_writeback(struct page *page)
+{
+ if (TestClearPageReclaim(page))
+ rotate_reclaimable_page(page);
+
+ if (!test_clear_page_writeback(page))
+ BUG();
+
+ smp_mb__after_clear_bit();
+ wake_up_page(page, PG_writeback);
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+/**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @page: the page to lock
+ */
+void __lock_page(struct page *page)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
+ TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(__lock_page);
+
+int __lock_page_killable(struct page *page)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(__lock_page_killable);
+
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
+{
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ /*
+ * CAUTION! In this case, mmap_sem is not released
+ * even though return 0.
+ */
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ up_read(&mm->mmap_sem);
+ if (flags & FAULT_FLAG_KILLABLE)
+ wait_on_page_locked_killable(page);
+ else
+ wait_on_page_locked(page);
+ return 0;
+ } else {
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
+
+ ret = __lock_page_killable(page);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
+ }
+}
+
+/**
+ * find_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Is there a pagecache struct page at the given (mapping, offset) tuple?
+ * If yes, increment its refcount and return it; if no, return NULL.
+ */
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
+{
+ void **pagep;
+ struct page *page;
+
+ rcu_read_lock();
+repeat:
+ page = NULL;
+ pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+ if (pagep) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto repeat;
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ goto out;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+ }
+out:
+ rcu_read_unlock();
+
+ return page;
+}
+EXPORT_SYMBOL(find_get_page);
+
+/**
+ * find_lock_page - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Locates the desired pagecache page, locks it, increments its reference
+ * count and returns its address.
+ *
+ * Returns zero if the page was not present. find_lock_page() may sleep.
+ */
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
+{
+ struct page *page;
+
+repeat:
+ page = find_get_page(mapping, offset);
+ if (page && !radix_tree_exception(page)) {
+ lock_page(page);
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON(page->index != offset);
+ }
+ return page;
+}
+EXPORT_SYMBOL(find_lock_page);
+
+/**
+ * find_or_create_page - locate or add a pagecache page
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
+ *
+ * Locates a page in the pagecache. If the page is not present, a new page
+ * is allocated using @gfp_mask and is added to the pagecache and to the VM's
+ * LRU list. The returned page is locked and has its reference count
+ * incremented.
+ *
+ * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
+ * allocation!
+ *
+ * find_or_create_page() returns the desired page's address, or zero on
+ * memory exhaustion.
+ */
+struct page *find_or_create_page(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp_mask)
+{
+ struct page *page;
+ int err;
+repeat:
+ page = find_lock_page(mapping, index);
+ if (!page) {
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return NULL;
+ /*
+ * We want a regular kernel memory (not highmem or DMA etc)
+ * allocation for the radix tree nodes, but we need to honour
+ * the context-specific requirements the caller has asked for.
+ * GFP_RECLAIM_MASK collects those requirements.
+ */
+ err = add_to_page_cache_lru(page, mapping, index,
+ (gfp_mask & GFP_RECLAIM_MASK));
+ if (unlikely(err)) {
+ page_cache_release(page);
+ page = NULL;
+ if (err == -EEXIST)
+ goto repeat;
+ }
+ }
+ return page;
+}
+EXPORT_SYMBOL(find_or_create_page);
+
+/**
+ * find_get_pages - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages() will search for and return a group of up to
+ * @nr_pages pages in the mapping. The pages are placed at @pages.
+ * find_get_pages() takes a reference against the returned pages.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes. There may be holes in the indices due to not-present pages.
+ *
+ * find_get_pages() returns the number of pages which were found.
+ */
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_pages, struct page **pages)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ WARN_ON(iter.index);
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so skip over it -
+ * we only reach this from invalidate_mapping_pages().
+ */
+ continue;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * find_get_pages_contig - gang contiguous pagecache lookup
+ * @mapping: The address_space to search
+ * @index: The starting page index
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages_contig() works exactly like find_get_pages(), except
+ * that the returned number of pages are guaranteed to be contiguous.
+ *
+ * find_get_pages_contig() returns the number of pages which were found.
+ */
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+ unsigned int nr_pages, struct page **pages)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned int ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ /* The hole, there no reason to continue */
+ if (unlikely(!page))
+ break;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so stop looking for
+ * contiguous pages.
+ */
+ break;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != iter.index) {
+ page_cache_release(page);
+ break;
+ }
+
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(find_get_pages_contig);
+
+/**
+ * find_get_pages_tag - find and return pages that match @tag
+ * @mapping: the address_space to search
+ * @index: the starting page index
+ * @tag: the tag index
+ * @nr_pages: the maximum number of pages
+ * @pages: where the resulting pages are placed
+ *
+ * Like find_get_pages, except we only return pages which are tagged with
+ * @tag. We update @index to index the next page for the traversal.
+ */
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+ int tag, unsigned int nr_pages, struct page **pages)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_tagged(slot, &mapping->page_tree,
+ &iter, *index, tag) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * This function is never used on a shmem/tmpfs
+ * mapping, so a swap entry won't be found here.
+ */
+ BUG();
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (ret)
+ *index = pages[ret - 1]->index + 1;
+
+ return ret;
+}
+EXPORT_SYMBOL(find_get_pages_tag);
+
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
+ * Same as grab_cache_page(), but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed. This routine should
+ * be safe to call while holding the lock for another page.
+ *
+ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+ * and deadlock against the caller's locked page.
+ */
+struct page *
+grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page = find_get_page(mapping, index);
+
+ if (page) {
+ if (trylock_page(page))
+ return page;
+ page_cache_release(page);
+ return NULL;
+ }
+ page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ return page;
+}
+EXPORT_SYMBOL(grab_cache_page_nowait);
+
+/*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+ *
+ * ---R__________________________________________B__________
+ * ^ reading here ^ bad block(assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * It is going insane. Fix it by quickly scaling down the readahead size.
+ */
+static void shrink_readahead_size_eio(struct file *filp,
+ struct file_ra_state *ra)
+{
+ ra->ra_pages /= 4;
+}
+
+/**
+ * do_generic_file_read - generic file read routine
+ * @filp: the file to read
+ * @ppos: current file position
+ * @desc: read_descriptor
+ * @actor: read method
+ *
+ * This is a generic file read routine, and uses the
+ * mapping->a_ops->readpage() function for the actual low-level stuff.
+ *
+ * This is really ugly. But the goto's actually try to clarify some
+ * of the logic when it comes to error handling etc.
+ */
+static void do_generic_file_read(struct file *filp, loff_t *ppos,
+ read_descriptor_t *desc, read_actor_t actor)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct file_ra_state *ra = &filp->f_ra;
+ pgoff_t index;
+ pgoff_t last_index;
+ pgoff_t prev_index;
+ unsigned long offset; /* offset into pagecache page */
+ unsigned int prev_offset;
+ int error;
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
+ last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_CACHE_MASK;
+
+ for (;;) {
+ struct page *page;
+ pgoff_t end_index;
+ loff_t isize;
+ unsigned long nr, ret;
+
+ cond_resched();
+find_page:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ page_cache_sync_readahead(mapping,
+ ra, filp,
+ index, last_index - index);
+ page = find_get_page(mapping, index);
+ if (unlikely(page == NULL))
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(mapping,
+ ra, filp, page,
+ index, last_index - index);
+ }
+ if (!PageUptodate(page)) {
+ if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ desc, offset))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ }
+page_ok:
+ /*
+ * i_size must be checked after we know the page is Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ goto out;
+ }
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index == end_index) {
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ page_cache_release(page);
+ goto out;
+ }
+ }
+ nr = nr - offset;
+
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ /*
+ * When a sequential read accesses a page several times,
+ * only mark it as accessed the first time.
+ */
+ if (prev_index != index || offset != prev_offset)
+ mark_page_accessed(page);
+ prev_index = index;
+
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ ret = actor(desc, page, offset, nr);
+ offset += ret;
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+ prev_offset = offset;
+
+ page_cache_release(page);
+ if (ret == nr && desc->count)
+ continue;
+ goto out;
+
+page_not_up_to_date:
+ /* Get exclusive access to the page ... */
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
+
+page_not_up_to_date_locked:
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ /* Did somebody else fill it already? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto page_ok;
+ }
+
+readpage:
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, eg. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
+ /* Start the actual read. The read will unlock the page. */
+ error = mapping->a_ops->readpage(filp, page);
+
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
+ goto readpage_error;
+ }
+
+ if (!PageUptodate(page)) {
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
+ if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_mapping_pages got it
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+ unlock_page(page);
+ shrink_readahead_size_eio(filp, ra);
+ error = -EIO;
+ goto readpage_error;
+ }
+ unlock_page(page);
+ }
+
+ goto page_ok;
+
+readpage_error:
+ /* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
+ page_cache_release(page);
+ goto out;
+
+no_cached_page:
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ */
+ page = page_cache_alloc_cold(mapping);
+ if (!page) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+ error = add_to_page_cache_lru(page, mapping,
+ index, GFP_KERNEL);
+ if (error) {
+ page_cache_release(page);
+ if (error == -EEXIST)
+ goto find_page;
+ desc->error = error;
+ goto out;
+ }
+ goto readpage;
+ }
+
+out:
+ ra->prev_pos = prev_index;
+ ra->prev_pos <<= PAGE_CACHE_SHIFT;
+ ra->prev_pos |= prev_offset;
+
+ *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
+ file_accessed(filp);
+}
+
+int file_read_actor(read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
+{
+ char *kaddr;
+ unsigned long left, count = desc->count;
+
+ if (size > count)
+ size = count;
+
+ /*
+ * Faults on the destination of a read are common, so do it before
+ * taking the kmap.
+ */
+ if (!fault_in_pages_writeable(desc->arg.buf, size)) {
+ kaddr = kmap_atomic(page);
+ left = __copy_to_user_inatomic(desc->arg.buf,
+ kaddr + offset, size);
+ kunmap_atomic(kaddr);
+ if (left == 0)
+ goto success;
+ }
+
+ /* Do it the slow way */
+ kaddr = kmap(page);
+ left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
+ kunmap(page);
+
+ if (left) {
+ size -= left;
+ desc->error = -EFAULT;
+ }
+success:
+ desc->count = count - size;
+ desc->written += size;
+ desc->arg.buf += size;
+ return size;
+}
+
+/*
+ * Performs necessary checks before doing a write
+ * @iov: io vector request
+ * @nr_segs: number of segments in the iovec
+ * @count: number of bytes to write
+ * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
+ *
+ * Adjust number of segments and amount of bytes to write (nr_segs should be
+ * properly initialized first). Returns appropriate error code that caller
+ * should return or zero in case that write should be allowed.
+ */
+int generic_segment_checks(const struct iovec *iov,
+ unsigned long *nr_segs, size_t *count, int access_flags)
+{
+ unsigned long seg;
+ size_t cnt = 0;
+ for (seg = 0; seg < *nr_segs; seg++) {
+ const struct iovec *iv = &iov[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ cnt += iv->iov_len;
+ if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+ continue;
+ if (seg == 0)
+ return -EFAULT;
+ *nr_segs = seg;
+ cnt -= iv->iov_len; /* This segment is no good */
+ break;
+ }
+ *count = cnt;
+ return 0;
+}
+EXPORT_SYMBOL(generic_segment_checks);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iov: io vector request
+ * @nr_segs: number of segments in the iovec
+ * @pos: current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *filp = iocb->ki_filp;
+ ssize_t retval;
+ unsigned long seg = 0;
+ size_t count;
+ loff_t *ppos = &iocb->ki_pos;
+
+ count = 0;
+ retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (retval)
+ return retval;
+
+ /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+ if (filp->f_flags & O_DIRECT) {
+ loff_t size;
+ struct address_space *mapping;
+ struct inode *inode;
+
+ mapping = filp->f_mapping;
+ inode = mapping->host;
+ if (!count)
+ goto out; /* skip atime */
+ size = i_size_read(inode);
+ if (pos < size) {
+ retval = filemap_write_and_wait_range(mapping, pos,
+ pos + iov_length(iov, nr_segs) - 1);
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+ }
+ if (retval > 0) {
+ *ppos = pos + retval;
+ count -= retval;
+ }
+
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
+ file_accessed(filp);
+ goto out;
+ }
+ }
+ }
+
+ count = retval;
+ for (seg = 0; seg < nr_segs; seg++) {
+ read_descriptor_t desc;
+ loff_t offset = 0;
+
+ /*
+ * If we did a short DIO read we need to skip the section of the
+ * iov that we've already read data into.
+ */
+ if (count) {
+ if (count > iov[seg].iov_len) {
+ count -= iov[seg].iov_len;
+ continue;
+ }
+ offset = count;
+ count = 0;
+ }
+
+ desc.written = 0;
+ desc.arg.buf = iov[seg].iov_base + offset;
+ desc.count = iov[seg].iov_len - offset;
+ if (desc.count == 0)
+ continue;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+ retval += desc.written;
+ if (desc.error) {
+ retval = retval ?: desc.error;
+ break;
+ }
+ if (desc.count > 0)
+ break;
+ }
+out:
+ return retval;
+}
+EXPORT_SYMBOL(generic_file_aio_read);
+
+#ifdef CONFIG_MMU
+/**
+ * page_cache_read - adds requested page to the page cache if not already there
+ * @file: file to read
+ * @offset: page index
+ *
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
+ */
+static int page_cache_read(struct file *file, pgoff_t offset)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct page *page;
+ int ret;
+
+ do {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ if (ret == 0)
+ ret = mapping->a_ops->readpage(file, page);
+ else if (ret == -EEXIST)
+ ret = 0; /* losing race to add is OK */
+
+ page_cache_release(page);
+
+ } while (ret == AOP_TRUNCATED_PAGE);
+
+ return ret;
+}
+
+#define MMAP_LOTSAMISS (100)
+
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+ struct file_ra_state *ra,
+ struct file *file,
+ pgoff_t offset)
+{
+ unsigned long ra_pages;
+ struct address_space *mapping = file->f_mapping;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (VM_RandomReadHint(vma))
+ return;
+ if (!ra->ra_pages)
+ return;
+
+ if (VM_SequentialReadHint(vma)) {
+ page_cache_sync_readahead(mapping, ra, file, offset,
+ ra->ra_pages);
+ return;
+ }
+
+ /* Avoid banging the cache line if not needed */
+ if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
+ ra->mmap_miss++;
+
+ /*
+ * Do we miss much more than hit in this file? If so,
+ * stop bothering with read-ahead. It will only hurt.
+ */
+ if (ra->mmap_miss > MMAP_LOTSAMISS)
+ return;
+
+ /*
+ * mmap read-around
+ */
+ ra_pages = max_sane_readahead(ra->ra_pages);
+ ra->start = max_t(long, 0, offset - ra_pages / 2);
+ ra->size = ra_pages;
+ ra->async_size = ra_pages / 4;
+ ra_submit(ra, mapping, file);
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further..
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+ struct file_ra_state *ra,
+ struct file *file,
+ struct page *page,
+ pgoff_t offset)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (VM_RandomReadHint(vma))
+ return;
+ if (ra->mmap_miss > 0)
+ ra->mmap_miss--;
+ if (PageReadahead(page))
+ page_cache_async_readahead(mapping, ra, file,
+ page, offset, ra->ra_pages);
+}
+
+/**
+ * filemap_fault - read in file data for page fault handling
+ * @vma: vma in which the fault was taken
+ * @vmf: struct vm_fault containing details of the fault
+ *
+ * filemap_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ */
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int error;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct file_ra_state *ra = &file->f_ra;
+ struct inode *inode = mapping->host;
+ pgoff_t offset = vmf->pgoff;
+ struct page *page;
+ pgoff_t size;
+ int ret = 0;
+
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (offset >= size)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Do we have something in the page cache already?
+ */
+ page = find_get_page(mapping, offset);
+ if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ /*
+ * We found the page, so try async readahead before
+ * waiting for the lock.
+ */
+ do_async_mmap_readahead(vma, ra, file, page, offset);
+ } else if (!page) {
+ /* No page in the page cache at all */
+ do_sync_mmap_readahead(vma, ra, file, offset);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ ret = VM_FAULT_MAJOR;
+retry_find:
+ page = find_get_page(mapping, offset);
+ if (!page)
+ goto no_cached_page;
+ }
+
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return ret | VM_FAULT_RETRY;
+ }
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON(page->index != offset);
+
+ /*
+ * We have a locked page in the page cache, now we need to check
+ * that it's up-to-date. If not, it is going to be due to an error.
+ */
+ if (unlikely(!PageUptodate(page)))
+ goto page_not_uptodate;
+
+ /*
+ * Found the page and have a reference on it.
+ * We must recheck i_size under page lock.
+ */
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(offset >= size)) {
+ unlock_page(page);
+ page_cache_release(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ vmf->page = page;
+ return ret | VM_FAULT_LOCKED;
+
+no_cached_page:
+ /*
+ * We're only likely to ever get here if MADV_RANDOM is in
+ * effect.
+ */
+ error = page_cache_read(file, offset);
+
+ /*
+ * The page we want has now been added to the page cache.
+ * In the unlikely event that someone removed it in the
+ * meantime, we'll just come back here and read it again.
+ */
+ if (error >= 0)
+ goto retry_find;
+
+ /*
+ * An error return from page_cache_read can result if the
+ * system is low on memory, or a problem occurs while trying
+ * to schedule I/O.
+ */
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
+
+page_not_uptodate:
+ /*
+ * Umm, take care of errors if the page isn't up-to-date.
+ * Try to re-read it _once_. We do this synchronously,
+ * because there really aren't any performance issues here
+ * and we need to check for errors.
+ */
+ ClearPageError(page);
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ error = -EIO;
+ }
+ page_cache_release(page);
+
+ if (!error || error == AOP_TRUNCATED_PAGE)
+ goto retry_find;
+
+ /* Things didn't work out. Return zero to tell the mm layer so. */
+ shrink_readahead_size_eio(file, ra);
+ return VM_FAULT_SIGBUS;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret = VM_FAULT_LOCKED;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+ /*
+ * We mark the page dirty already here so that when freeze is in
+ * progress, we are guaranteed that writeback during freezing will
+ * see the dirty page and writeprotect it again.
+ */
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_page_mkwrite);
+
+const struct vm_operations_struct generic_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
+/* This is used for a general mmap of a disk file */
+
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &generic_file_vm_ops;
+ return 0;
+}
+
+/*
+ * This is for filesystems which do not implement ->writepage.
+ */
+int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+ return generic_file_mmap(file, vma);
+}
+#else
+int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENOSYS;
+}
+int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_MMU */
+
+EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap);
+
+static struct page *__read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data,
+ gfp_t gfp)
+{
+ struct page *page;
+ int err;
+repeat:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ page = __page_cache_alloc(gfp | __GFP_COLD);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ err = add_to_page_cache_lru(page, mapping, index, gfp);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ if (err == -EEXIST)
+ goto repeat;
+ /* Presumably ENOMEM for radix tree node */
+ return ERR_PTR(err);
+ }
+ err = filler(data, page);
+ if (err < 0) {
+ page_cache_release(page);
+ page = ERR_PTR(err);
+ }
+ }
+ return page;
+}
+
+static struct page *do_read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data,
+ gfp_t gfp)
+
+{
+ struct page *page;
+ int err;
+
+retry:
+ page = __read_cache_page(mapping, index, filler, data, gfp);
+ if (IS_ERR(page))
+ return page;
+ if (PageUptodate(page))
+ goto out;
+
+ lock_page(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto retry;
+ }
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ err = filler(data, page);
+ if (err < 0) {
+ page_cache_release(page);
+ return ERR_PTR(err);
+ }
+out:
+ mark_page_accessed(page);
+ return page;
+}
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: first arg to filler(data, page) function, often left as NULL
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data)
+{
+ return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(read_cache_page_async);
+
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+ pgoff_t index,
+ gfp_t gfp)
+{
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+ return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: first arg to filler(data, page) function, often left as NULL
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page then wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data)
+{
+ return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
+}
+EXPORT_SYMBOL(read_cache_page);
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0;
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
+ copied += copy;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left))
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied. If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ BUG_ON(!in_atomic());
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
+ * This has the same sideeffects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap(page);
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user);
+
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct iovec *iov = i->iov;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overruning the iovec).
+ */
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
+ int copy;
+
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->iov = iov;
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+ return fault_in_pages_readable(buf, bytes);
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+ const struct iovec *iov = i->iov;
+ if (i->nr_segs == 1)
+ return i->count;
+ else
+ return min(i->count, iov->iov_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ */
+inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned long limit = rlimit(RLIMIT_FSIZE);
+
+ if (unlikely(*pos < 0))
+ return -EINVAL;
+
+ if (!isblk) {
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (file->f_flags & O_APPEND)
+ *pos = i_size_read(inode);
+
+ if (limit != RLIM_INFINITY) {
+ if (*pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ if (*count > limit - (typeof(limit))*pos) {
+ *count = limit - (typeof(limit))*pos;
+ }
+ }
+ }
+
+ /*
+ * LFS rule
+ */
+ if (unlikely(*pos + *count > MAX_NON_LFS &&
+ !(file->f_flags & O_LARGEFILE))) {
+ if (*pos >= MAX_NON_LFS) {
+ return -EFBIG;
+ }
+ if (*count > MAX_NON_LFS - (unsigned long)*pos) {
+ *count = MAX_NON_LFS - (unsigned long)*pos;
+ }
+ }
+
+ /*
+ * Are we about to exceed the fs block limit ?
+ *
+ * If we have written data it becomes a short write. If we have
+ * exceeded without writing data we send a signal and return EFBIG.
+ * Linus frestrict idea will clean these up nicely..
+ */
+ if (likely(!isblk)) {
+ if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
+ if (*count || *pos > inode->i_sb->s_maxbytes) {
+ return -EFBIG;
+ }
+ /* zero-length writes at ->s_maxbytes are OK */
+ }
+
+ if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
+ *count = inode->i_sb->s_maxbytes - *pos;
+ } else {
+#ifdef CONFIG_BLOCK
+ loff_t isize;
+ if (bdev_read_only(I_BDEV(inode)))
+ return -EPERM;
+ isize = i_size_read(inode);
+ if (*pos >= isize) {
+ if (*count || *pos > isize)
+ return -ENOSPC;
+ }
+
+ if (*pos + *count > isize)
+ *count = isize - *pos;
+#else
+ return -EPERM;
+#endif
+ }
+ return 0;
+}
+EXPORT_SYMBOL(generic_write_checks);
+
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ return aops->write_begin(file, mapping, pos, len, flags,
+ pagep, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ mark_page_accessed(page);
+ return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, size_t ocount)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t written;
+ size_t write_len;
+ pgoff_t end;
+
+ if (count != ocount)
+ *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+ write_len = iov_length(iov, *nr_segs);
+ end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+
+ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
+ if (written)
+ goto out;
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ if (mapping->nrpages) {
+ written = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ goto out;
+ }
+ }
+
+ written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+
+ /*
+ * Finally, try again to invalidate clean pages which might have been
+ * cached by non-direct readahead, or faulted in by get_user_pages()
+ * if the source of the write was an mmap'ed region of the file
+ * we're writing. Either one is a pretty crazy thing to do,
+ * so we don't support it 100%. If this invalidation
+ * fails, tough, the write still worked...
+ */
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ }
+
+ if (written > 0) {
+ pos += written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ *ppos = pos;
+ }
+out:
+ return written;
+}
+EXPORT_SYMBOL(generic_file_direct_write);
+
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ */
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
+{
+ int status;
+ gfp_t gfp_mask;
+ struct page *page;
+ gfp_t gfp_notmask = 0;
+
+ gfp_mask = mapping_gfp_mask(mapping);
+ if (mapping_cap_account_dirty(mapping))
+ gfp_mask |= __GFP_WRITE;
+ if (flags & AOP_FLAG_NOFS)
+ gfp_notmask = __GFP_FS;
+repeat:
+ page = find_lock_page(mapping, index);
+ if (page)
+ goto found;
+
+ page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
+ if (!page)
+ return NULL;
+ status = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & ~gfp_notmask);
+ if (unlikely(status)) {
+ page_cache_release(page);
+ if (status == -EEXIST)
+ goto repeat;
+ return NULL;
+ }
+found:
+ wait_for_stable_page(page);
+ return page;
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
+
+static ssize_t generic_perform_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = 0;
+
+ /*
+ * Copies from kernel address space cannot fail (NFSD is a big user).
+ */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(i));
+
+again:
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ &page, &fsdata);
+ if (unlikely(status))
+ break;
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+ flush_dcache_page(page);
+
+ mark_page_accessed(page);
+ status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ page, fsdata);
+ if (unlikely(status < 0))
+ break;
+ copied = status;
+
+ cond_resched();
+
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+
+ balance_dirty_pages_ratelimited(mapping);
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+ } while (iov_iter_count(i));
+
+ return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t status;
+ struct iov_iter i;
+
+ iov_iter_init(&i, iov, nr_segs, count, written);
+ status = generic_perform_write(file, &i, pos);
+
+ if (likely(status >= 0)) {
+ written += status;
+ *ppos = pos + status;
+ }
+
+ return written ? written : status;
+}
+EXPORT_SYMBOL(generic_file_buffered_write);
+
+/**
+ * __generic_file_aio_write - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @iov: vector with data to write
+ * @nr_segs: number of segments in the vector
+ * @ppos: position where to write
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * object which does not need locking at all.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_mutex.
+ */
+ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
+ struct inode *inode = mapping->host;
+ loff_t pos;
+ ssize_t written;
+ ssize_t err;
+
+ ocount = 0;
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ return err;
+
+ count = ocount;
+ pos = *ppos;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ loff_t endbyte;
+ ssize_t written_buffered;
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+ ppos, count, ocount);
+ if (written < 0 || written == count)
+ goto out;
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+ written_buffered = generic_file_buffered_write(iocb, iov,
+ nr_segs, pos, ppos, count,
+ written);
+ /*
+ * If generic_file_buffered_write() retuned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + written_buffered - written - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (err == 0) {
+ written = written_buffered;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_file_buffered_write(iocb, iov, nr_segs,
+ pos, ppos, count, written);
+ }
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
+EXPORT_SYMBOL(__generic_file_aio_write);
+
+/**
+ * generic_file_aio_write - write data to a file
+ * @iocb: IO state structure
+ * @iov: vector with data to write
+ * @nr_segs: number of segments in the vector
+ * @pos: position in file where to write
+ *
+ * This is a wrapper around __generic_file_aio_write() to be used by most
+ * filesystems. It takes care of syncing the file in case of O_SYNC file
+ * and acquires i_mutex as needed.
+ */
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ sb_start_write(inode->i_sb);
+ mutex_lock(&inode->i_mutex);
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ sb_end_write(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(generic_file_aio_write);
+
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private). If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * This may also be called if PG_fscache is set on a page, indicating that the
+ * page is known to the local caching routines.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ *
+ */
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
+{
+ struct address_space * const mapping = page->mapping;
+
+ BUG_ON(!PageLocked(page));
+ if (PageWriteback(page))
+ return 0;
+
+ if (mapping && mapping->a_ops->releasepage)
+ return mapping->a_ops->releasepage(page, gfp_mask);
+ return try_to_free_buffers(page);
+}
+
+EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 00000000..a912da6d
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,486 @@
+/*
+ * linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+/*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
+static struct page *__xip_sparse_page;
+
+/* called under xip_sparse_mutex */
+static struct page *xip_sparse_page(void)
+{
+ if (!__xip_sparse_page) {
+ struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+
+ if (page)
+ __xip_sparse_page = page;
+ }
+ return __xip_sparse_page;
+}
+
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_mem() function for the actual low-level
+ * stuff.
+ *
+ * Note the struct file* is not used at all. It may be NULL.
+ */
+static ssize_t
+do_xip_mapping_read(struct address_space *mapping,
+ struct file_ra_state *_ra,
+ struct file *filp,
+ char __user *buf,
+ size_t len,
+ loff_t *ppos)
+{
+ struct inode *inode = mapping->host;
+ pgoff_t index, end_index;
+ unsigned long offset;
+ loff_t isize, pos;
+ size_t copied = 0, error = 0;
+
+ BUG_ON(!mapping->a_ops->get_xip_mem);
+
+ pos = *ppos;
+ index = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_CACHE_MASK;
+
+ isize = i_size_read(inode);
+ if (!isize)
+ goto out;
+
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ do {
+ unsigned long nr, left;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int zero = 0;
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index >= end_index) {
+ if (index > end_index)
+ goto out;
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ goto out;
+ }
+ }
+ nr = nr - offset;
+ if (nr > len - copied)
+ nr = len - copied;
+
+ error = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(error)) {
+ if (error == -ENODATA) {
+ /* sparse */
+ zero = 1;
+ } else
+ goto out;
+ }
+
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ /* address based flush */ ;
+
+ /*
+ * Ok, we have the mem, so now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ if (!zero)
+ left = __copy_to_user(buf+copied, xip_mem+offset, nr);
+ else
+ left = __clear_user(buf + copied, nr);
+
+ if (left) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ copied += (nr - left);
+ offset += (nr - left);
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+ } while (copied < len);
+
+out:
+ *ppos = pos + copied;
+ if (filp)
+ file_accessed(filp);
+
+ return (copied ? copied : error);
+}
+
+ssize_t
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
+
+ return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+ buf, len, ppos);
+}
+EXPORT_SYMBOL_GPL(xip_file_read);
+
+/*
+ * __xip_unmap is invoked from xip_unmap and
+ * xip_write
+ *
+ * This function walks all vmas of the address_space and unmaps the
+ * __xip_sparse_page when found at pgoff.
+ */
+static void
+__xip_unmap (struct address_space * mapping,
+ unsigned long pgoff)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ unsigned long address;
+ pte_t *pte;
+ pte_t pteval;
+ spinlock_t *ptl;
+ struct page *page;
+ unsigned count;
+ int locked = 0;
+
+ count = read_seqcount_begin(&xip_sparse_seq);
+
+ page = __xip_sparse_page;
+ if (!page)
+ return;
+
+retry:
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ mm = vma->vm_mm;
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ pte = page_check_address(page, mm, address, &ptl, 1);
+ if (pte) {
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ pteval = ptep_clear_flush(vma, address, pte);
+ page_remove_rmap(page);
+ dec_mm_counter(mm, MM_FILEPAGES);
+ BUG_ON(pte_dirty(pteval));
+ pte_unmap_unlock(pte, ptl);
+ /* must invalidate_page _before_ freeing the page */
+ mmu_notifier_invalidate_page(mm, address);
+ page_cache_release(page);
+ }
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (locked) {
+ mutex_unlock(&xip_sparse_mutex);
+ } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+ mutex_lock(&xip_sparse_mutex);
+ locked = 1;
+ goto retry;
+ }
+}
+
+/*
+ * xip_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_fault, but used for execute in place
+ */
+static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t size;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ struct page *page;
+ int error;
+
+ /* XXX: are VM_FAULT_ codes OK? */
+again:
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
+
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (likely(!error))
+ goto found;
+ if (error != -ENODATA)
+ return VM_FAULT_OOM;
+
+ /* sparse block */
+ if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+ (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
+ (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+ int err;
+
+ /* maybe shared writable, allocate new block */
+ mutex_lock(&xip_sparse_mutex);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
+ &xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
+ if (error)
+ return VM_FAULT_SIGBUS;
+ /* unmap sparse mappings at pgoff from all other vmas */
+ __xip_unmap(mapping, vmf->pgoff);
+
+found:
+ err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+ xip_pfn);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ /*
+ * err == -EBUSY is fine, we've raced against another thread
+ * that faulted-in the same page
+ */
+ if (err != -EBUSY)
+ BUG_ON(err);
+ return VM_FAULT_NOPAGE;
+ } else {
+ int err, ret = VM_FAULT_OOM;
+
+ mutex_lock(&xip_sparse_mutex);
+ write_seqcount_begin(&xip_sparse_seq);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(!error)) {
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+ goto again;
+ }
+ if (error != -ENODATA)
+ goto out;
+ /* not shared and writable, use xip_sparse_page() */
+ page = xip_sparse_page();
+ if (!page)
+ goto out;
+ err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+ page);
+ if (err == -ENOMEM)
+ goto out;
+
+ ret = VM_FAULT_NOPAGE;
+out:
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+
+ return ret;
+ }
+}
+
+static const struct vm_operations_struct xip_file_vm_ops = {
+ .fault = xip_file_fault,
+ .page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
+int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
+
+ file_accessed(file);
+ vma->vm_ops = &xip_file_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+
+static ssize_t
+__xip_file_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t pos, loff_t *ppos)
+{
+ struct address_space * mapping = filp->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status = 0;
+ size_t bytes;
+ ssize_t written = 0;
+
+ BUG_ON(!mapping->a_ops->get_xip_mem);
+
+ do {
+ unsigned long index;
+ unsigned long offset;
+ size_t copied;
+ void *xip_mem;
+ unsigned long xip_pfn;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ index = pos >> PAGE_CACHE_SHIFT;
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (status == -ENODATA) {
+ /* we allocate a new page unmap it */
+ mutex_lock(&xip_sparse_mutex);
+ status = a_ops->get_xip_mem(mapping, index, 1,
+ &xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
+ if (!status)
+ /* unmap page at pgoff from all other vmas */
+ __xip_unmap(mapping, index);
+ }
+
+ if (status)
+ break;
+
+ copied = bytes -
+ __copy_from_user_nocache(xip_mem + offset, buf, bytes);
+
+ if (likely(copied > 0)) {
+ status = copied;
+
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ }
+ }
+ if (unlikely(copied != bytes))
+ if (status >= 0)
+ status = -EFAULT;
+ if (status < 0)
+ break;
+ } while (count);
+ *ppos = pos;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ */
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+
+ return written ? written : status;
+}
+
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ size_t count;
+ loff_t pos;
+ ssize_t ret;
+
+ sb_start_write(inode->i_sb);
+
+ mutex_lock(&inode->i_mutex);
+
+ if (!access_ok(VERIFY_READ, buf, len)) {
+ ret=-EFAULT;
+ goto out_up;
+ }
+
+ pos = *ppos;
+ count = len;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_backing;
+ if (count == 0)
+ goto out_backing;
+
+ ret = file_remove_suid(filp);
+ if (ret)
+ goto out_backing;
+
+ ret = file_update_time(filp);
+ if (ret)
+ goto out_backing;
+
+ ret = __xip_file_write (filp, buf, count, pos, ppos);
+
+ out_backing:
+ current->backing_dev_info = NULL;
+ out_up:
+ mutex_unlock(&inode->i_mutex);
+ sb_end_write(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_write);
+
+/*
+ * truncate a page used for execute in place
+ * functionality is analog to block_truncate_page but does use get_xip_mem
+ * to get the page instead of page cache
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize;
+ unsigned length;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int err;
+
+ BUG_ON(!mapping->a_ops->get_xip_mem);
+
+ blocksize = 1 << mapping->host->i_blkbits;
+ length = offset & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!length)
+ return 0;
+
+ length = blocksize - length;
+
+ err = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(err)) {
+ if (err == -ENODATA)
+ /* Hole? No need to truncate */
+ return 0;
+ else
+ return err;
+ }
+ memset(xip_mem + offset, 0, length);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
new file mode 100644
index 00000000..4723ac8d
--- /dev/null
+++ b/mm/fremap.c
@@ -0,0 +1,267 @@
+/*
+ * linux/mm/fremap.c
+ *
+ * Explicit pagetable population and nonlinear (random) mappings support.
+ *
+ * started by Ingo Molnar, Copyright (C) 2002, 2003
+ */
+#include <linux/export.h>
+#include <linux/backing-dev.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/rmap.h>
+#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = *ptep;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ flush_cache_page(vma, addr, pte_pfn(pte));
+ pte = ptep_clear_flush(vma, addr, ptep);
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ page_remove_rmap(page);
+ page_cache_release(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, MM_FILEPAGES);
+ }
+ } else {
+ if (!pte_file(pte))
+ free_swap_and_cache(pte_to_swp_entry(pte));
+ pte_clear_not_present_full(mm, addr, ptep, 0);
+ }
+}
+
+/*
+ * Install a file pte to a given virtual memory address, release any
+ * previously existing mapping.
+ */
+static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pgoff, pgprot_t prot)
+{
+ int err = -ENOMEM;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+ goto out;
+
+ if (!pte_none(*pte))
+ zap_pte(mm, vma, addr, pte);
+
+ set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+ /*
+ * We don't need to run update_mmu_cache() here because the "file pte"
+ * being installed by install_file_pte() is not a real pte - it's a
+ * non-present entry (like a swap entry), noting what file offset should
+ * be mapped there when there's a fault (in a non-linear vma where
+ * that's not obvious).
+ */
+ pte_unmap_unlock(pte, ptl);
+ err = 0;
+out:
+ return err;
+}
+
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ do {
+ err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
+ if (err)
+ return err;
+
+ size -= PAGE_SIZE;
+ addr += PAGE_SIZE;
+ pgoff++;
+ } while (size);
+
+ return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
+/**
+ * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
+ * @start: start of the remapped virtual memory range
+ * @size: size of the remapped virtual memory range
+ * @prot: new protection bits of the range (see NOTE)
+ * @pgoff: to-be-mapped page of the backing store file
+ * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
+ *
+ * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
+ * (shared backing store file).
+ *
+ * This syscall works purely via pagetables, so it's the most efficient
+ * way to map the same (large) file into a given virtual window. Unlike
+ * mmap()/mremap() it does not create any new vmas. The new mappings are
+ * also safe across swapout.
+ *
+ * NOTE: the @prot parameter right now is ignored (but must be zero),
+ * and the vma's default protection is used. Arbitrary protections
+ * might be implemented in the future.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct address_space *mapping;
+ struct vm_area_struct *vma;
+ int err = -EINVAL;
+ int has_write_lock = 0;
+ vm_flags_t vm_flags = 0;
+
+ if (prot)
+ return err;
+ /*
+ * Sanitize the syscall parameters:
+ */
+ start = start & PAGE_MASK;
+ size = size & PAGE_MASK;
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ if (start + size <= start)
+ return err;
+
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return err;
+
+ /* Can we represent this offset inside this architecture's pte's? */
+#if PTE_FILE_MAX_BITS < BITS_PER_LONG
+ if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
+ return err;
+#endif
+
+ /* We need down_write() to change vma->vm_flags. */
+ down_read(&mm->mmap_sem);
+ retry:
+ vma = find_vma(mm, start);
+
+ /*
+ * Make sure the vma is shared, that it supports prefaulting,
+ * and that the remapped range is valid and fully within
+ * the single existing vma.
+ */
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
+
+ if (!vma->vm_ops || !vma->vm_ops->remap_pages)
+ goto out;
+
+ if (start < vma->vm_start || start + size > vma->vm_end)
+ goto out;
+
+ /* Must set VM_NONLINEAR before any pages are populated. */
+ if (!(vma->vm_flags & VM_NONLINEAR)) {
+ /*
+ * vm_private_data is used as a swapout cursor
+ * in a VM_NONLINEAR vma.
+ */
+ if (vma->vm_private_data)
+ goto out;
+
+ /* Don't need a nonlinear mapping, exit success */
+ if (pgoff == linear_page_index(vma, start)) {
+ err = 0;
+ goto out;
+ }
+
+ if (!has_write_lock) {
+get_write_lock:
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ has_write_lock = 1;
+ goto retry;
+ }
+ mapping = vma->vm_file->f_mapping;
+ /*
+ * page_mkclean doesn't work on nonlinear vmas, so if
+ * dirty pages need to be accounted, emulate with linear
+ * vmas.
+ */
+ if (mapping_cap_account_dirty(mapping)) {
+ unsigned long addr;
+ struct file *file = get_file(vma->vm_file);
+
+ vm_flags = vma->vm_flags;
+ if (!(flags & MAP_NONBLOCK))
+ vm_flags |= VM_POPULATE;
+ addr = mmap_region(file, start, size, vm_flags, pgoff);
+ fput(file);
+ if (IS_ERR_VALUE(addr)) {
+ err = addr;
+ } else {
+ BUG_ON(addr != start);
+ err = 0;
+ }
+ goto out;
+ }
+ mutex_lock(&mapping->i_mmap_mutex);
+ flush_dcache_mmap_lock(mapping);
+ vma->vm_flags |= VM_NONLINEAR;
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
+ vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+ flush_dcache_mmap_unlock(mapping);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ }
+
+ if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
+ if (!has_write_lock)
+ goto get_write_lock;
+ vma->vm_flags |= VM_POPULATE;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ /*
+ * drop PG_Mlocked flag for over-mapped range
+ */
+ if (!has_write_lock)
+ goto get_write_lock;
+ vm_flags = vma->vm_flags;
+ munlock_vma_pages_range(vma, start, start + size);
+ vma->vm_flags = vm_flags;
+ }
+
+ mmu_notifier_invalidate_range_start(mm, start, start + size);
+ err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
+ mmu_notifier_invalidate_range_end(mm, start, start + size);
+
+ /*
+ * We can't clear VM_NONLINEAR because we'd have to do
+ * it after ->populate completes, and that would prevent
+ * downgrading the lock. (Locks can't be upgraded).
+ */
+
+out:
+ if (vma)
+ vm_flags = vma->vm_flags;
+ if (likely(!has_write_lock))
+ up_read(&mm->mmap_sem);
+ else
+ up_write(&mm->mmap_sem);
+ if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
+ mm_populate(start, size);
+
+ return err;
+}
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 00000000..2890e67d
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,370 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/security.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increases control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+/*
+ * If enabled, the underlying tmem implementation is capable of doing
+ * exclusive gets, so frontswap_load, on a successful tmem_get must
+ * mark the page as no longer in frontswap AND mark it dirty.
+ */
+static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+ frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+ frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+ frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = true;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */
+void frontswap_tmem_exclusive_gets(bool enable)
+{
+ frontswap_tmem_exclusive_gets_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.init(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ frontswap_clear(sis, offset);
+ atomic_dec(&sis->frontswap_pages);
+}
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = frontswap_ops.store(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ inc_frontswap_succ_stores();
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else {
+ /*
+ failed dup always results in automatic invalidate of
+ the (older) page from frontswap
+ */
+ inc_frontswap_failed_stores();
+ if (dup)
+ __frontswap_clear(sis, offset);
+ }
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ ret = frontswap_ops.load(type, offset, page);
+ if (ret == 0) {
+ inc_frontswap_loads();
+ if (frontswap_tmem_exclusive_gets_enabled) {
+ SetPageDirty(page);
+ frontswap_clear(sis, offset);
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset)) {
+ frontswap_ops.invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+ }
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+static unsigned long __frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ totalpages += atomic_read(&si->frontswap_pages);
+ }
+ return totalpages;
+}
+
+static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int *swapid)
+{
+ int ret = -EINVAL;
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+ } else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages)) {
+ ret = -ENOMEM;
+ continue;
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+ *swapid = type;
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Used to check if it's necessory and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shink pages,
+ * error code when there is an error.
+ */
+static int __frontswap_shrink(unsigned long target_pages,
+ unsigned long *pages_to_unuse,
+ int *type)
+{
+ unsigned long total_pages = 0, total_pages_to_unuse;
+
+ assert_spin_locked(&swap_lock);
+
+ total_pages = __frontswap_curr_pages();
+ if (total_pages <= target_pages) {
+ /* Nothing to do */
+ *pages_to_unuse = 0;
+ return 1;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
+}
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ unsigned long pages_to_unuse = 0;
+ int uninitialized_var(type), ret;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+ spin_unlock(&swap_lock);
+ if (ret == 0)
+ try_to_unuse(type, true, pages_to_unuse);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+
+ spin_lock(&swap_lock);
+ totalpages = __frontswap_curr_pages();
+ spin_unlock(&swap_lock);
+
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", S_IRUGO, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/highmem.c b/mm/highmem.c
new file mode 100644
index 00000000..b32b70cd
--- /dev/null
+++ b/mm/highmem.c
@@ -0,0 +1,424 @@
+/*
+ * High memory handling common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * 64-bit physical space. With current x86 CPUs this
+ * means up to 64 Gigabytes physical RAM.
+ *
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/kgdb.h>
+#include <asm/tlbflush.h>
+
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
+/*
+ * Virtual_count is not a pure "count".
+ * 0 means that it is not mapped, and has not been mapped
+ * since a TLB flush - it is usable.
+ * 1 means that there are no users, but it has been mapped
+ * since the last TLB flush - so we can't use it.
+ * n means that there are (n-1) current users of it.
+ */
+#ifdef CONFIG_HIGHMEM
+
+unsigned long totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(totalhigh_pages);
+
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
+unsigned int nr_free_highpages (void)
+{
+ pg_data_t *pgdat;
+ unsigned int pages = 0;
+
+ for_each_online_pgdat(pgdat) {
+ pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+ NR_FREE_PAGES);
+ if (zone_movable_is_highmem())
+ pages += zone_page_state(
+ &pgdat->node_zones[ZONE_MOVABLE],
+ NR_FREE_PAGES);
+ }
+
+ return pages;
+}
+
+static int pkmap_count[LAST_PKMAP];
+static unsigned int last_pkmap_nr;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
+
+pte_t * pkmap_page_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+/*
+ * Most architectures have no use for kmap_high_get(), so let's abstract
+ * the disabling of IRQ out of the locking in that case to save on a
+ * potential useless overhead.
+ */
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+#define lock_kmap() spin_lock_irq(&kmap_lock)
+#define unlock_kmap() spin_unlock_irq(&kmap_lock)
+#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags)
+#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags)
+#else
+#define lock_kmap() spin_lock(&kmap_lock)
+#define unlock_kmap() spin_unlock(&kmap_lock)
+#define lock_kmap_any(flags) \
+ do { spin_lock(&kmap_lock); (void)(flags); } while (0)
+#define unlock_kmap_any(flags) \
+ do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
+#endif
+
+struct page *kmap_to_page(void *vaddr)
+{
+ unsigned long addr = (unsigned long)vaddr;
+
+ if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
+ int i = PKMAP_NR(addr);
+ return pte_page(pkmap_page_table[i]);
+ }
+
+ return virt_to_page(addr);
+}
+EXPORT_SYMBOL(kmap_to_page);
+
+static void flush_all_zero_pkmaps(void)
+{
+ int i;
+ int need_flush = 0;
+
+ flush_cache_kmaps();
+
+ for (i = 0; i < LAST_PKMAP; i++) {
+ struct page *page;
+
+ /*
+ * zero means we don't have anything to do,
+ * >1 means that it is still in use. Only
+ * a count of 1 means that it is free but
+ * needs to be unmapped
+ */
+ if (pkmap_count[i] != 1)
+ continue;
+ pkmap_count[i] = 0;
+
+ /* sanity check */
+ BUG_ON(pte_none(pkmap_page_table[i]));
+
+ /*
+ * Don't need an atomic fetch-and-clear op here;
+ * no-one has the page mapped, and cannot get at
+ * its virtual address (and hence PTE) without first
+ * getting the kmap_lock (which is held here).
+ * So no dangers, even with speculative execution.
+ */
+ page = pte_page(pkmap_page_table[i]);
+ pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
+
+ set_page_address(page, NULL);
+ need_flush = 1;
+ }
+ if (need_flush)
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+}
+
+/**
+ * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+ */
+void kmap_flush_unused(void)
+{
+ lock_kmap();
+ flush_all_zero_pkmaps();
+ unlock_kmap();
+}
+
+static inline unsigned long map_new_virtual(struct page *page)
+{
+ unsigned long vaddr;
+ int count;
+
+start:
+ count = LAST_PKMAP;
+ /* Find an empty entry */
+ for (;;) {
+ last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+ if (!last_pkmap_nr) {
+ flush_all_zero_pkmaps();
+ count = LAST_PKMAP;
+ }
+ if (!pkmap_count[last_pkmap_nr])
+ break; /* Found a usable entry */
+ if (--count)
+ continue;
+
+ /*
+ * Sleep for somebody else to unmap their entries
+ */
+ {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&pkmap_map_wait, &wait);
+ unlock_kmap();
+ schedule();
+ remove_wait_queue(&pkmap_map_wait, &wait);
+ lock_kmap();
+
+ /* Somebody else might have mapped it while we slept */
+ if (page_address(page))
+ return (unsigned long)page_address(page);
+
+ /* Re-start */
+ goto start;
+ }
+ }
+ vaddr = PKMAP_ADDR(last_pkmap_nr);
+ set_pte_at(&init_mm, vaddr,
+ &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
+
+ pkmap_count[last_pkmap_nr] = 1;
+ set_page_address(page, (void *)vaddr);
+
+ return vaddr;
+}
+
+/**
+ * kmap_high - map a highmem page into memory
+ * @page: &struct page to map
+ *
+ * Returns the page's virtual memory address.
+ *
+ * We cannot call this from interrupts, as it may block.
+ */
+void *kmap_high(struct page *page)
+{
+ unsigned long vaddr;
+
+ /*
+ * For highmem pages, we can't trust "virtual" until
+ * after we have the lock.
+ */
+ lock_kmap();
+ vaddr = (unsigned long)page_address(page);
+ if (!vaddr)
+ vaddr = map_new_virtual(page);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
+ unlock_kmap();
+ return (void*) vaddr;
+}
+
+EXPORT_SYMBOL(kmap_high);
+
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+/**
+ * kmap_high_get - pin a highmem page into memory
+ * @page: &struct page to pin
+ *
+ * Returns the page's current virtual memory address, or NULL if no mapping
+ * exists. If and only if a non null address is returned then a
+ * matching call to kunmap_high() is necessary.
+ *
+ * This can be called from any context.
+ */
+void *kmap_high_get(struct page *page)
+{
+ unsigned long vaddr, flags;
+
+ lock_kmap_any(flags);
+ vaddr = (unsigned long)page_address(page);
+ if (vaddr) {
+ BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ }
+ unlock_kmap_any(flags);
+ return (void*) vaddr;
+}
+#endif
+
+/**
+ * kunmap_high - unmap a highmem page into memory
+ * @page: &struct page to unmap
+ *
+ * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
+ * only from user context.
+ */
+void kunmap_high(struct page *page)
+{
+ unsigned long vaddr;
+ unsigned long nr;
+ unsigned long flags;
+ int need_wakeup;
+
+ lock_kmap_any(flags);
+ vaddr = (unsigned long)page_address(page);
+ BUG_ON(!vaddr);
+ nr = PKMAP_NR(vaddr);
+
+ /*
+ * A count must never go down to zero
+ * without a TLB flush!
+ */
+ need_wakeup = 0;
+ switch (--pkmap_count[nr]) {
+ case 0:
+ BUG();
+ case 1:
+ /*
+ * Avoid an unnecessary wake_up() function call.
+ * The common case is pkmap_count[] == 1, but
+ * no waiters.
+ * The tasks queued in the wait-queue are guarded
+ * by both the lock in the wait-queue-head and by
+ * the kmap_lock. As the kmap_lock is held here,
+ * no need for the wait-queue-head's lock. Simply
+ * test if the queue is empty.
+ */
+ need_wakeup = waitqueue_active(&pkmap_map_wait);
+ }
+ unlock_kmap_any(flags);
+
+ /* do wake-up, if needed, race-free outside of the spin lock */
+ if (need_wakeup)
+ wake_up(&pkmap_map_wait);
+}
+
+EXPORT_SYMBOL(kunmap_high);
+#endif
+
+#if defined(HASHED_PAGE_VIRTUAL)
+
+#define PA_HASH_ORDER 7
+
+/*
+ * Describes one page->virtual association
+ */
+struct page_address_map {
+ struct page *page;
+ void *virtual;
+ struct list_head list;
+};
+
+static struct page_address_map page_address_maps[LAST_PKMAP];
+
+/*
+ * Hash table bucket
+ */
+static struct page_address_slot {
+ struct list_head lh; /* List of page_address_maps */
+ spinlock_t lock; /* Protect this bucket's list */
+} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
+
+static struct page_address_slot *page_slot(const struct page *page)
+{
+ return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
+}
+
+/**
+ * page_address - get the mapped virtual address of a page
+ * @page: &struct page to get the virtual address of
+ *
+ * Returns the page's virtual address.
+ */
+void *page_address(const struct page *page)
+{
+ unsigned long flags;
+ void *ret;
+ struct page_address_slot *pas;
+
+ if (!PageHighMem(page))
+ return lowmem_page_address(page);
+
+ pas = page_slot(page);
+ ret = NULL;
+ spin_lock_irqsave(&pas->lock, flags);
+ if (!list_empty(&pas->lh)) {
+ struct page_address_map *pam;
+
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ ret = pam->virtual;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock_irqrestore(&pas->lock, flags);
+ return ret;
+}
+
+EXPORT_SYMBOL(page_address);
+
+/**
+ * set_page_address - set a page's virtual address
+ * @page: &struct page to set
+ * @virtual: virtual address to use
+ */
+void set_page_address(struct page *page, void *virtual)
+{
+ unsigned long flags;
+ struct page_address_slot *pas;
+ struct page_address_map *pam;
+
+ BUG_ON(!PageHighMem(page));
+
+ pas = page_slot(page);
+ if (virtual) { /* Add */
+ pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
+ pam->page = page;
+ pam->virtual = virtual;
+
+ spin_lock_irqsave(&pas->lock, flags);
+ list_add_tail(&pam->list, &pas->lh);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ } else { /* Remove */
+ spin_lock_irqsave(&pas->lock, flags);
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ list_del(&pam->list);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ goto done;
+ }
+ }
+ spin_unlock_irqrestore(&pas->lock, flags);
+ }
+done:
+ return;
+}
+
+void __init page_address_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
+ INIT_LIST_HEAD(&page_address_htable[i].lh);
+ spin_lock_init(&page_address_htable[i].lock);
+ }
+}
+
+#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 00000000..e2f7f5aa
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2779 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/shrinker.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/migrate.h>
+#include <linux/hashtable.h>
+
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int khugepaged_slab_init(void);
+
+#define MM_SLOTS_HASH_BITS 10
+static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+};
+static struct khugepaged_scan khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+static int set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+
+ if (!khugepaged_enabled())
+ return 0;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes)
+ min_free_kbytes = recommended_min;
+ setup_per_zone_wmarks();
+ return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
+static int start_khugepaged(void)
+{
+ int err = 0;
+ if (khugepaged_enabled()) {
+ if (!khugepaged_thread)
+ khugepaged_thread = kthread_run(khugepaged, NULL,
+ "khugepaged");
+ if (unlikely(IS_ERR(khugepaged_thread))) {
+ printk(KERN_ERR
+ "khugepaged: kthread_run(khugepaged) failed\n");
+ err = PTR_ERR(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+
+ if (!list_empty(&khugepaged_scan.mm_head))
+ wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
+ } else if (khugepaged_thread) {
+ kthread_stop(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+
+ return err;
+}
+
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);
+
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!zero_page) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+ return 0;
+ }
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef CONFIG_SYSFS
+
+static ssize_t double_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (test_bit(enabled, &transparent_hugepage_flags)) {
+ VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+ return sprintf(buf, "[always] madvise never\n");
+ } else if (test_bit(req_madv, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
+}
+static ssize_t double_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ set_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+}
+static ssize_t enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+ if (ret > 0) {
+ int err;
+
+ mutex_lock(&khugepaged_mutex);
+ err = start_khugepaged();
+ mutex_unlock(&khugepaged_mutex);
+
+ if (err)
+ ret = err;
+ }
+
+ return ret;
+}
+static struct kobj_attribute enabled_attr =
+ __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static ssize_t single_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
+{
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
+}
+
+static ssize_t single_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag flag)
+{
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
+ set_bit(flag, &transparent_hugepage_flags);
+ else
+ clear_bit(flag, &transparent_hugepage_flags);
+
+ return count;
+}
+
+/*
+ * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
+ * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
+ * memory just to allocate one more hugepage.
+ */
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static struct kobj_attribute defrag_attr =
+ __ATTR(defrag, 0644, defrag_show, defrag_store);
+
+static ssize_t use_zero_page_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+ __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+#ifdef CONFIG_DEBUG_VM
+static ssize_t debug_cow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static ssize_t debug_cow_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static struct kobj_attribute debug_cow_attr =
+ __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+ &use_zero_page_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+ &debug_cow_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_scan_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+ __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+ scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_alloc_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+ __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+ alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long pages;
+
+ err = strict_strtoul(buf, 10, &pages);
+ if (err || !pages || pages > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_pages_to_scan = pages;
+
+ return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+ __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+ pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+ __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+ __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+ __ATTR(defrag, 0644, khugepaged_defrag_show,
+ khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_none;
+
+ err = strict_strtoul(buf, 10, &max_ptes_none);
+ if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_none = max_ptes_none;
+
+ return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+ __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+ khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+
+static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+ int err;
+
+ *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+ if (unlikely(!*hugepage_kobj)) {
+ printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
+ return -ENOMEM;
+ }
+
+ err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
+ goto delete_obj;
+ }
+
+ err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
+ goto remove_hp_group;
+ }
+
+ return 0;
+
+remove_hp_group:
+ sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
+delete_obj:
+ kobject_put(*hugepage_kobj);
+ return err;
+}
+
+static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+ sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
+ sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
+ kobject_put(hugepage_kobj);
+}
+#else
+static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+ return 0;
+}
+
+static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+}
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+ int err;
+ struct kobject *hugepage_kobj;
+
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ return -EINVAL;
+ }
+
+ err = hugepage_init_sysfs(&hugepage_kobj);
+ if (err)
+ return err;
+
+ err = khugepaged_slab_init();
+ if (err)
+ goto out;
+
+ register_shrinker(&huge_zero_page_shrinker);
+
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than TLB overhead
+ * is likely to save. The admin can still enable it through /sys.
+ */
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ transparent_hugepage_flags = 0;
+
+ start_khugepaged();
+
+ return 0;
+out:
+ hugepage_exit_sysfs(hugepage_kobj);
+ return err;
+}
+module_init(hugepage_init)
+
+static int __init setup_transparent_hugepage(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ if (!strcmp(str, "always")) {
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING
+ "transparent_hugepage= cannot parse, ignored\n");
+ return ret;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pmd = pmd_mkwrite(pmd);
+ return pmd;
+}
+
+static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+{
+ pmd_t entry;
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ return entry;
+}
+
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ struct page *page)
+{
+ pgtable_t pgtable;
+
+ VM_BUG_ON(!PageCompound(page));
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+
+ clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ __SetPageUptodate(page);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_none(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ pte_free(mm, pgtable);
+ } else {
+ pmd_t entry;
+ entry = mk_huge_pmd(page, vma);
+ /*
+ * The spinlocking to take the lru_lock inside
+ * page_add_new_anon_rmap() acts as a full memory
+ * barrier to be sure clear_huge_page writes become
+ * visible after the set_pmd_at() write.
+ */
+ page_add_new_anon_rmap(page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm->nr_ptes++;
+ spin_unlock(&mm->page_table_lock);
+ }
+
+ return 0;
+}
+
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+{
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr, int nd,
+ gfp_t extra_gfp)
+{
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+ HPAGE_PMD_ORDER, vma, haddr, nd);
+}
+
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+#endif
+
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
+{
+ pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
+ return true;
+}
+
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pte_t *pte;
+
+ if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ unsigned long zero_pfn;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ goto out;
+ }
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
+ page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
+ }
+
+ return 0;
+ }
+out:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
+ return VM_FAULT_OOM;
+ /* if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable;
+ int ret;
+
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ /*
+ * mm->page_table_lock is enough to be sure that huge zero pmd is not
+ * under splitting since we don't split the page itself, only pmd to
+ * a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ unsigned long zero_pfn;
+ bool set;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
+ BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+ ret = 0;
+ goto out_unlock;
+ }
+ if (unlikely(pmd_trans_splitting(pmd))) {
+ /* split huge page running from under us */
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+ pte_free(dst_mm, pgtable);
+
+ wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ goto out;
+ }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON(!PageHead(src_page));
+ get_page(src_page);
+ page_dup_rmap(src_page);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ pgtable_trans_huge_deposit(dst_mm, pgtable);
+ dst_mm->nr_ptes++;
+
+ ret = 0;
+out_unlock:
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+out:
+ return ret;
+}
+
+void huge_pmd_set_accessed(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ int dirty)
+{
+ pmd_t entry;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto unlock;
+
+ entry = pmd_mkyoung(orig_pmd);
+ haddr = address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+ update_mmu_cache_pmd(vma, address, pmd);
+
+unlock:
+ spin_unlock(&mm->page_table_lock);
+}
+
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_page;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
+ inc_mm_counter(mm, MM_ANONPAGES);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+out_free_page:
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
+}
+
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ struct page *page,
+ unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int ret = 0, i;
+ struct page **pages;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+ GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+ __GFP_OTHER_NODE,
+ vma, address, page_to_nid(page));
+ if (unlikely(!pages[i] ||
+ mem_cgroup_newpage_charge(pages[i], mm,
+ GFP_KERNEL))) {
+ if (pages[i])
+ put_page(pages[i]);
+ mem_cgroup_uncharge_start();
+ while (--i >= 0) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ copy_user_highpage(pages[i], page + i,
+ haddr + PAGE_SIZE * i, vma);
+ __SetPageUptodate(pages[i]);
+ cond_resched();
+ }
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_pages;
+ VM_BUG_ON(!PageHead(page));
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = mk_pte(pages[i], vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(pages[i], vma, haddr);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ kfree(pages);
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ page_remove_rmap(page);
+ spin_unlock(&mm->page_table_lock);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ goto out;
+}
+
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+ int ret = 0;
+ struct page *page = NULL, *new_page;
+ unsigned long haddr;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_unlock;
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ if (page_mapcount(page) == 1) {
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+ update_mmu_cache_pmd(vma, address, pmd);
+ ret |= VM_FAULT_WRITE;
+ goto out_unlock;
+ }
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+alloc:
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow())
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ else
+ new_page = NULL;
+
+ if (unlikely(!new_page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+ address, pmd, orig_pmd, haddr);
+ } else {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
+ put_page(page);
+ }
+ goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ put_page(new_page);
+ if (page) {
+ split_huge_page(page);
+ put_page(page);
+ }
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (is_huge_zero_pmd(orig_pmd))
+ clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ __SetPageUptodate(new_page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (page)
+ put_page(page);
+ if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_page(new_page);
+ put_page(new_page);
+ goto out_mn;
+ } else {
+ pmd_t entry;
+ entry = mk_huge_pmd(new_page, vma);
+ pmdp_clear_flush(vma, haddr, pmd);
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, pmd);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ } else {
+ VM_BUG_ON(!PageHead(page));
+ page_remove_rmap(page);
+ put_page(page);
+ }
+ ret |= VM_FAULT_WRITE;
+ }
+ spin_unlock(&mm->page_table_lock);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+ return ret;
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ goto out;
+
+ /* Avoid dumping huge zero page */
+ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
+ return ERR_PTR(-EFAULT);
+
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!PageHead(page));
+ if (flags & FOLL_TOUCH) {
+ pmd_t _pmd;
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but
+ * for now the dirty bit in the pmd is meaningless.
+ * And if the dirty bit will become meaningful and
+ * we'll only set it with FOLL_WRITE, an atomic
+ * set_bit will be required on the pmd to set the
+ * young bit, instead of the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+ }
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain();
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ VM_BUG_ON(!PageCompound(page));
+ if (flags & FOLL_GET)
+ get_page_foll(page);
+
+out:
+ return page;
+}
+
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+ get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ put_page(page);
+ goto clear_pmdnuma;
+ }
+
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+
+ /* Confirm the PTE did not while locked */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr, page, target_nid);
+ if (!migrated)
+ goto check_same;
+
+ task_numa_fault(target_nid, HPAGE_PMD_NR, true);
+ return 0;
+
+check_same:
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+clear_pmdnuma:
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+ return 0;
+}
+
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+{
+ int ret = 0;
+
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t orig_pmd;
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ }
+ pte_free(tlb->mm, pgtable);
+ ret = 1;
+ }
+ return ret;
+}
+
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ int ret = 0;
+
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ /*
+ * All logical pages in the range are present
+ * if backed by a huge page.
+ */
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ int ret = 0;
+ pmd_t pmd;
+
+ struct mm_struct *mm = vma->vm_mm;
+
+ if ((old_addr & ~HPAGE_PMD_MASK) ||
+ (new_addr & ~HPAGE_PMD_MASK) ||
+ old_end - old_addr < HPAGE_PMD_SIZE ||
+ (new_vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have release it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd))) {
+ VM_BUG_ON(pmd_trans_huge(*new_pmd));
+ goto out;
+ }
+
+ ret = __pmd_trans_huge_lock(old_pmd, vma);
+ if (ret == 1) {
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ spin_unlock(&mm->page_table_lock);
+ }
+out:
+ return ret;
+}
+
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot, int prot_numa)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int ret = 0;
+
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
+ pmd_t entry;
+ entry = pmdp_get_and_clear(mm, addr, pmd);
+ if (!prot_numa) {
+ entry = pmd_modify(entry, newprot);
+ BUG_ON(pmd_write(entry));
+ } else {
+ struct page *page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
+ !pmd_numa(*pmd)) {
+ entry = pmd_mknuma(entry);
+ }
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * Returns 1 if a given pmd maps a stable (not under splitting) thp.
+ * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ *
+ * Note that if it returns 1, this routine returns without unlocking page
+ * table locks. So callers must unlock them.
+ */
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+{
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ return -1;
+ } else {
+ /* Thp mapped by 'pmd' is stable, so we can
+ * handle it as it is. */
+ return 1;
+ }
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+}
+
+pmd_t *page_check_address_pmd(struct page *page,
+ struct mm_struct *mm,
+ unsigned long address,
+ enum page_check_address_pmd_flag flag)
+{
+ pmd_t *pmd, *ret = NULL;
+
+ if (address & ~HPAGE_PMD_MASK)
+ goto out;
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ goto out;
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_page(*pmd) != page)
+ goto out;
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+ * no risk as long as all huge pmd are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto out;
+ if (pmd_trans_huge(*pmd)) {
+ VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+ !pmd_trans_splitting(*pmd));
+ ret = pmd;
+ }
+out:
+ return ret;
+}
+
+static int __split_huge_page_splitting(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ int ret = 0;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = address;
+ const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
+
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ if (pmd) {
+ /*
+ * We can't temporarily set the pmd to null in order
+ * to split it, the pmd must remain marked huge at all
+ * times or the VM won't take the pmd_trans_huge paths
+ * and it won't wait on the anon_vma->root->rwsem to
+ * serialize against split_huge_page*.
+ */
+ pmdp_splitting_flush(vma, address, pmd);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ return ret;
+}
+
+static void __split_huge_page_refcount(struct page *page)
+{
+ int i;
+ struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
+ int tail_count = 0;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ compound_lock(page);
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(page);
+
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ struct page *page_tail = page + i;
+
+ /* tail_page->_mapcount cannot change */
+ BUG_ON(page_mapcount(page_tail) < 0);
+ tail_count += page_mapcount(page_tail);
+ /* check for overflow */
+ BUG_ON(tail_count < 0);
+ BUG_ON(atomic_read(&page_tail->_count) != 0);
+ /*
+ * tail_page->_count is zero and not changing from
+ * under us. But get_page_unless_zero() may be running
+ * from under us on the tail_page. If we used
+ * atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is
+ * implemented in C not using locked ops. spin_unlock
+ * on x86 sometime uses locked ops because of PPro
+ * errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and
+ * not only on x86), it's safer to use atomic_add().
+ */
+ atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+ &page_tail->_count);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb();
+
+ /*
+ * retain hwpoison flag of the poisoned tail page:
+ * fix for the unsuitable process killed on Guest Machine(KVM)
+ * by the memory-failure.
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /* clear PageTail before overwriting first_page */
+ smp_wmb();
+
+ /*
+ * __split_huge_page_splitting() already set the
+ * splitting bit in all pmd that could map this
+ * hugepage, that will ensure no CPU can alter the
+ * mapcount on the head page. The mapcount is only
+ * accounted in the head page and it has to be
+ * transferred to all tail pages in the below code. So
+ * for this code to be safe, the split the mapcount
+ * can't change. But that doesn't mean userland can't
+ * keep changing and reading the page contents while
+ * we transfer the mapcount, so the pmd splitting
+ * status is achieved setting a reserved bit in the
+ * pmd, not by clearing the present bit.
+ */
+ page_tail->_mapcount = page->_mapcount;
+
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
+
+ page_tail->index = page->index + i;
+ page_nid_xchg_last(page_tail, page_nid_last(page));
+
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+
+ lru_add_page_tail(page, page_tail, lruvec);
+ }
+ atomic_sub(tail_count, &page->_count);
+ BUG_ON(atomic_read(&page->_count) <= 0);
+
+ __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+
+ ClearPageCompound(page);
+ compound_unlock(page);
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ }
+
+ /*
+ * Only the head page (now become a regular page) is required
+ * to be pinned by the caller.
+ */
+ BUG_ON(page_count(page) <= 0);
+}
+
+static int __split_huge_page_map(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd, _pmd;
+ int ret = 0, i;
+ pgtable_t pgtable;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ if (pmd) {
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ haddr = address;
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!pmd_write(*pmd))
+ entry = pte_wrprotect(entry);
+ else
+ BUG_ON(page_mapcount(page) != 1);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and
+ * userland has the whole access to the hugepage
+ * during the split (which happens in place). If we
+ * overwrite the pmd with the not-huge version
+ * pointing to the pte here (which of course we could
+ * if all CPUs were bug free), userland could trigger
+ * a small page size TLB miss on the small sized TLB
+ * while the hugepage TLB entry is still established
+ * in the huge TLB. Some CPU doesn't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+ * Erratum 383 on page 93. Intel should be safe but is
+ * also warns that it's only safe if the permission
+ * and cache attributes of the two entries loaded in
+ * the two TLB is identical (which should be the case
+ * here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual
+ * address to be loaded simultaneously. So instead of
+ * doing "pmd_populate(); flush_tlb_range();" we first
+ * mark the current pmd notpresent (atomically because
+ * here the pmd_trans_huge and pmd_trans_splitting
+ * must remain set at all times on the pmd until the
+ * split is complete for this pmd), then we flush the
+ * SMP TLB and finally we write the non-huge version
+ * of the pmd entry with pmd_populate.
+ */
+ pmdp_invalidate(vma, address, pmd);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+/* must be called with anon_vma->root->rwsem held */
+static void __split_huge_page(struct page *page,
+ struct anon_vma *anon_vma)
+{
+ int mapcount, mapcount2;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct anon_vma_chain *avc;
+
+ BUG_ON(!PageHead(page));
+ BUG_ON(PageTail(page));
+
+ mapcount = 0;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ mapcount += __split_huge_page_splitting(page, vma, addr);
+ }
+ /*
+ * It is critical that new vmas are added to the tail of the
+ * anon_vma list. This guarantes that if copy_huge_pmd() runs
+ * and establishes a child pmd before
+ * __split_huge_page_splitting() freezes the parent pmd (so if
+ * we fail to prevent copy_huge_pmd() from running until the
+ * whole __split_huge_page() is complete), we will still see
+ * the newly established pmd of the child later during the
+ * walk, to be able to set it as pmd_trans_splitting too.
+ */
+ if (mapcount != page_mapcount(page))
+ printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG_ON(mapcount != page_mapcount(page));
+
+ __split_huge_page_refcount(page);
+
+ mapcount2 = 0;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ mapcount2 += __split_huge_page_map(page, vma, addr);
+ }
+ if (mapcount != mapcount2)
+ printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG_ON(mapcount != mapcount2);
+}
+
+int split_huge_page(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ int ret = 1;
+
+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
+ BUG_ON(!PageAnon(page));
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma disappearing so we first we take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(page);
+ if (!anon_vma)
+ goto out;
+ anon_vma_lock_write(anon_vma);
+
+ ret = 0;
+ if (!PageCompound(page))
+ goto out_unlock;
+
+ BUG_ON(!PageSwapBacked(page));
+ __split_huge_page(page, anon_vma);
+ count_vm_event(THP_SPLIT);
+
+ BUG_ON(PageCompound(page));
+out_unlock:
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+out:
+ return ret;
+}
+
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ switch (advice) {
+ case MADV_HUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ if (mm->def_flags & VM_NOHUGEPAGE)
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma become good for khugepaged to scan,
+ * register it here without waiting a page fault that
+ * may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
+
+ return 0;
+}
+
+static int __init khugepaged_slab_init(void)
+{
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot), 0, NULL);
+ if (!mm_slot_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
+ if (mm == mm_slot->mm)
+ return mm_slot;
+
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ mm_slot->mm = mm;
+ hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON(khugepaged_test_exit(mm));
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ atomic_inc(&mm->mm_count);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+ unsigned long hstart, hend;
+ if (!vma->anon_vma)
+ /*
+ * Not yet faulted in so we will register later in the
+ * page fault if needed.
+ */
+ return 0;
+ if (vma->vm_ops)
+ /* khugepaged not yet working on file or special mappings */
+ return 0;
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma);
+ return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ if (free) {
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+ * return all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_sem.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+}
+
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+ while (--_pte >= pte) {
+ pte_t pteval = *_pte;
+ if (!pte_none(pteval))
+ release_pte_page(pte_page(pteval));
+ }
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page;
+ pte_t *_pte;
+ int referenced = 0, none = 0;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out;
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page))
+ goto out;
+
+ VM_BUG_ON(PageCompound(page));
+ BUG_ON(!PageAnon(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out;
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page))
+ goto out;
+ /*
+ * Isolate the page to avoid collapsing an hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+
+ /* If there is no mapped pte young don't collapse the page */
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (likely(referenced))
+ return 1;
+out:
+ release_pte_pages(pte, _pte);
+ return 0;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval)) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON(page_mapcount(src_page) != 1);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ address += PAGE_SIZE;
+ page++;
+ }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
+
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (IS_ERR(*hpage)) {
+ if (!*wait)
+ return false;
+
+ *wait = false;
+ *hpage = NULL;
+ khugepaged_alloc_sleep();
+ } else if (*hpage) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
+ VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is more
+ * friendly behavior (OTOH it may actually hide bugs) to
+ * filesystems in userland with daemons allocating memory in
+ * the userland I/O paths. Allocating memory with the
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+ *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+ */
+ up_read(&mm->mmap_sem);
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ *hpage = ERR_PTR(-ENOMEM);
+ return NULL;
+ }
+
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (!*wait)
+ return NULL;
+
+ *wait = false;
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+ return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (!*hpage)
+ *hpage = khugepaged_alloc_hugepage(wait);
+
+ if (unlikely(!*hpage))
+ return false;
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
+ up_read(&mm->mmap_sem);
+ VM_BUG_ON(!*hpage);
+ return *hpage;
+}
+#endif
+
+static bool hugepage_vma_check(struct vm_area_struct *vma)
+{
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ return false;
+
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ return true;
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node)
+{
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *ptl;
+ int isolated;
+ unsigned long hstart, hend;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ /* release the mmap_sem read lock. */
+ new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ if (!new_page)
+ return;
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+ return;
+
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later hanlded by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ vma = find_vma(mm, address);
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ goto out;
+ if (!hugepage_vma_check(vma))
+ goto out;
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ goto out;
+ if (pmd_trans_huge(*pmd))
+ goto out;
+
+ anon_vma_lock_write(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, address);
+ ptl = pte_lockptr(mm, pmd);
+
+ mmun_start = address;
+ mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+ _pmd = pmdp_clear_flush(vma, address, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ spin_lock(ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte);
+ spin_unlock(ptl);
+
+ if (unlikely(!isolated)) {
+ pte_unmap(pte);
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd_at(mm, address, pmd, _pmd);
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock_write(vma->anon_vma);
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock_write(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ pte_unmap(pte);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd);
+
+ _pmd = mk_huge_pmd(new_page, vma);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to avoid the copy_huge_page writes to become
+ * visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address);
+ set_pmd_at(mm, address, pmd, _pmd);
+ update_mmu_cache_pmd(vma, address, pmd);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ spin_unlock(&mm->page_table_lock);
+
+ *hpage = NULL;
+
+ khugepaged_pages_collapsed++;
+out_up_write:
+ up_write(&mm->mmap_sem);
+ return;
+
+out:
+ mem_cgroup_uncharge_page(new_page);
+ goto out_up_write;
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, referenced = 0, none = 0;
+ struct page *page;
+ unsigned long _address;
+ spinlock_t *ptl;
+ int node = NUMA_NO_NODE;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ goto out;
+ if (pmd_trans_huge(*pmd))
+ goto out;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out_unmap;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out_unmap;
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page))
+ goto out_unmap;
+ /*
+ * Chose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
+ */
+ if (node == NUMA_NO_NODE)
+ node = page_to_nid(page);
+ VM_BUG_ON(PageCompound(page));
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ goto out_unmap;
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out_unmap;
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (referenced)
+ ret = 1;
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma, node);
+out:
+ return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+ if (!hugepage_vma_check(vma)) {
+skip:
+ progress++;
+ continue;
+ }
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ kthread_should_stop();
+}
+
+static void khugepaged_do_scan(void)
+{
+ struct page *hpage = NULL;
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+ bool wait = true;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ if (!khugepaged_prealloc_page(&hpage, &wait))
+ break;
+
+ cond_resched();
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ &hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+
+ if (!IS_ERR_OR_NULL(hpage))
+ put_page(hpage);
+}
+
+static void khugepaged_wait_work(void)
+{
+ try_to_freeze();
+
+ if (khugepaged_has_work()) {
+ if (!khugepaged_scan_sleep_millisecs)
+ return;
+
+ wait_event_freezable_timeout(khugepaged_wait,
+ kthread_should_stop(),
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+ return;
+ }
+
+ if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
+}
+
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, 19);
+
+ while (!kthread_should_stop()) {
+ khugepaged_do_scan();
+ khugepaged_wait_work();
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd)
+{
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) {
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!page_count(page));
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ split_huge_page(page);
+
+ put_page(page);
+ BUG_ON(pmd_trans_huge(*pmd));
+}
+
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL);
+ split_huge_page_pmd(vma, address, pmd);
+}
+
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd_mm(mm, address, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't page aligned and it could previously
+ * contain an hugepage: check if we need to split an huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 00000000..0a0be33b
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,3184 @@
+/*
+ * Generic hugetlb support.
+ * (C) Nadia Yvette Chambers, April 2004
+ */
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
+#include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/bootmem.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+
+#include <linux/io.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+#include <linux/node.h>
+#include "internal.h"
+
+const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
+unsigned long hugepages_treat_as_movable;
+
+int hugetlb_max_hstate __read_mostly;
+unsigned int default_hstate_idx;
+struct hstate hstates[HUGE_MAX_HSTATE];
+
+__initdata LIST_HEAD(huge_boot_pages);
+
+/* for command line parsing */
+static struct hstate * __initdata parsed_hstate;
+static unsigned long __initdata default_hstate_max_huge_pages;
+static unsigned long __initdata default_hstate_size;
+
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
+DEFINE_SPINLOCK(hugetlb_lock);
+
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, free the subpool the subpool remain */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
+
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
+ }
+ spin_unlock(&spool->lock);
+
+ return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+ /* If hugetlbfs_put_super couldn't free spool due to
+ * an outstanding quota reference, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(file_inode(vma->vm_file));
+}
+
+/*
+ * Region tracking -- allows tracking of reservations and instantiated pages
+ * across the pages in a mapping.
+ *
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * must either hold the mmap_sem for write, or the mmap_sem for read and
+ * the hugetlb_instantiation mutex:
+ *
+ * down_write(&mm->mmap_sem);
+ * or
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
+ */
+struct file_region {
+ struct list_head link;
+ long from;
+ long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+ struct file_region *rg, *nrg, *trg;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg;
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* If this area reaches higher then extend our area to
+ * include it completely. If this is not the first area
+ * which we intend to reuse, free it. */
+ if (rg->to > t)
+ t = rg->to;
+ if (rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+ nrg->from = f;
+ nrg->to = t;
+ return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+ struct file_region *rg, *nrg;
+ long chg = 0;
+
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* If we are below the current region then a new region is required.
+ * Subtle, allocate a new region at the position but make it zero
+ * size such that we can guarantee to record the reservation. */
+ if (&rg->link == head || t < rg->from) {
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ nrg->from = f;
+ nrg->to = f;
+ INIT_LIST_HEAD(&nrg->link);
+ list_add(&nrg->link, rg->link.prev);
+
+ return t - f;
+ }
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ list_for_each_entry(rg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ return chg;
+
+ /* We overlap with this area, if it extends further than
+ * us then we must extend ourselves. Account for its
+ * existing reservation. */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+ }
+ return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+ struct file_region *rg, *trg;
+ long chg = 0;
+
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (end <= rg->to)
+ break;
+ if (&rg->link == head)
+ return 0;
+
+ /* If we are in the middle of a region then adjust it. */
+ if (end > rg->from) {
+ chg = rg->to - end;
+ rg->to = end;
+ rg = list_entry(rg->link.next, typeof(*rg), link);
+ }
+
+ /* Drop any remaining regions. */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ chg += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return chg;
+}
+
+static long region_count(struct list_head *head, long f, long t)
+{
+ struct file_region *rg;
+ long chg = 0;
+
+ /* Locate each segment we overlap with, and count that overlap. */
+ list_for_each_entry(rg, head, link) {
+ long seg_from;
+ long seg_to;
+
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
+ break;
+
+ seg_from = max(rg->from, f);
+ seg_to = min(rg->to, t);
+
+ chg += seg_to - seg_from;
+ }
+
+ return chg;
+}
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_hugecache_offset(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return ((address - vma->vm_start) >> huge_page_shift(h)) +
+ (vma->vm_pgoff >> huge_page_order(h));
+}
+
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
+/*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+ struct hstate *hstate;
+
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
+
+ hstate = hstate_vma(vma);
+
+ return 1UL << (hstate->order + PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
+ * Flags for MAP_PRIVATE reservations. These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping. A shared mapping has a region map associated
+ * with the underlying file, this region map represents the backing file
+ * pages which have ever had a reservation assigned which this persists even
+ * after the page is instantiated. A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it, this region map represents those offsets which have consumed
+ * reservation ie. where pages have been instantiated.
+ */
+static unsigned long get_vma_private_data(struct vm_area_struct *vma)
+{
+ return (unsigned long)vma->vm_private_data;
+}
+
+static void set_vma_private_data(struct vm_area_struct *vma,
+ unsigned long value)
+{
+ vma->vm_private_data = (void *)value;
+}
+
+struct resv_map {
+ struct kref refs;
+ struct list_head regions;
+};
+
+static struct resv_map *resv_map_alloc(void)
+{
+ struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+ if (!resv_map)
+ return NULL;
+
+ kref_init(&resv_map->refs);
+ INIT_LIST_HEAD(&resv_map->regions);
+
+ return resv_map;
+}
+
+static void resv_map_release(struct kref *ref)
+{
+ struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+
+ /* Clear out any active regions before we release the map. */
+ region_truncate(&resv_map->regions, 0);
+ kfree(resv_map);
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return (struct resv_map *)(get_vma_private_data(vma) &
+ ~HPAGE_RESV_MASK);
+ return NULL;
+}
+
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
+
+ set_vma_private_data(vma, (get_vma_private_data(vma) &
+ HPAGE_RESV_MASK) | (unsigned long)map);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
+
+ set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+ return (get_vma_private_data(vma) & flag) != 0;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct hstate *h,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_NORESERVE)
+ return;
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /* Shared mappings always use reserves */
+ h->resv_huge_pages--;
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
+ h->resv_huge_pages--;
+ }
+}
+
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_reserves(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_MAYSHARE)
+ return 1;
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return 1;
+ return 0;
+}
+
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
+static void enqueue_huge_page(struct hstate *h, struct page *page)
+{
+ int nid = page_to_nid(page);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
+ h->free_huge_pages++;
+ h->free_huge_pages_node[nid]++;
+}
+
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
+static struct page *dequeue_huge_page_vma(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long address, int avoid_reserve)
+{
+ struct page *page = NULL;
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask, &mpol, &nodemask);
+ /*
+ * A child process with MAP_PRIVATE mappings created by their parent
+ * have no page reserves. This check ensures that reservations are
+ * not "stolen". The child may still get SIGKILLed
+ */
+ if (!vma_has_reserves(vma) &&
+ h->free_huge_pages - h->resv_huge_pages == 0)
+ goto err;
+
+ /* If reserves cannot be used, ensure enough pages are in the pool */
+ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ goto err;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ MAX_NR_ZONES - 1, nodemask) {
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+ break;
+ }
+ }
+ }
+
+ mpol_cond_put(mpol);
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+ return page;
+
+err:
+ mpol_cond_put(mpol);
+ return NULL;
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page)
+{
+ int i;
+
+ VM_BUG_ON(h->order >= MAX_ORDER);
+
+ h->nr_huge_pages--;
+ h->nr_huge_pages_node[page_to_nid(page)]--;
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
+ }
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
+ set_compound_page_dtor(page, NULL);
+ set_page_refcounted(page);
+ arch_release_hugepage(page);
+ __free_pages(page, huge_page_order(h));
+}
+
+struct hstate *size_to_hstate(unsigned long size)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ if (huge_page_size(h) == size)
+ return h;
+ }
+ return NULL;
+}
+
+static void free_huge_page(struct page *page)
+{
+ /*
+ * Can't pass hstate in here because it is called from the
+ * compound page destructor.
+ */
+ struct hstate *h = page_hstate(page);
+ int nid = page_to_nid(page);
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
+
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
+ if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ /* remove the page from active list */
+ list_del(&page->lru);
+ update_and_free_page(h, page);
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ } else {
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+ }
+ spin_unlock(&hugetlb_lock);
+ hugepage_subpool_put_pages(spool, 1);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ INIT_LIST_HEAD(&page->lru);
+ set_compound_page_dtor(page, free_huge_page);
+ spin_lock(&hugetlb_lock);
+ set_hugetlb_cgroup(page, NULL);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+ spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
+}
+
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ /* we rely on prep_new_huge_page to set the destructor */
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __SetPageTail(p);
+ set_page_count(p, 0);
+ p->first_page = page;
+ }
+}
+
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages. See the PageTransHuge() documentation for more
+ * details.
+ */
+int PageHuge(struct page *page)
+{
+ compound_page_dtor *dtor;
+
+ if (!PageCompound(page))
+ return 0;
+
+ page = compound_head(page);
+ dtor = get_compound_page_dtor(page);
+
+ return dtor == free_huge_page;
+}
+EXPORT_SYMBOL_GPL(PageHuge);
+
+static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ if (h->order >= MAX_ORDER)
+ return NULL;
+
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ if (page) {
+ if (arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ return NULL;
+ }
+ prep_new_huge_page(h, page, nid);
+ }
+
+ return page;
+}
+
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ struct page *page;
+ int start_nid;
+ int next_nid;
+ int ret = 0;
+
+ start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ next_nid = start_nid;
+
+ do {
+ page = alloc_fresh_huge_page_node(h, next_nid);
+ if (page) {
+ ret = 1;
+ break;
+ }
+ next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ } while (next_nid != start_nid);
+
+ if (ret)
+ count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return ret;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ bool acct_surplus)
+{
+ int start_nid;
+ int next_nid;
+ int ret = 0;
+
+ start_nid = hstate_next_node_to_free(h, nodes_allowed);
+ next_nid = start_nid;
+
+ do {
+ /*
+ * If we're returning unused surplus pages, only examine
+ * nodes with surplus pages.
+ */
+ if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+ !list_empty(&h->hugepage_freelists[next_nid])) {
+ struct page *page =
+ list_entry(h->hugepage_freelists[next_nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[next_nid]--;
+ if (acct_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[next_nid]--;
+ }
+ update_and_free_page(h, page);
+ ret = 1;
+ break;
+ }
+ next_nid = hstate_next_node_to_free(h, nodes_allowed);
+ } while (next_nid != start_nid);
+
+ return ret;
+}
+
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+{
+ struct page *page;
+ unsigned int r_nid;
+
+ if (h->order >= MAX_ORDER)
+ return NULL;
+
+ /*
+ * Assume we will successfully allocate the surplus page to
+ * prevent racing processes from causing the surplus to exceed
+ * overcommit
+ *
+ * This however introduces a different race, where a process B
+ * tries to grow the static hugepage pool while alloc_pages() is
+ * called by process A. B will only examine the per-node
+ * counters in determining if surplus huge pages can be
+ * converted to normal huge pages in adjust_pool_surplus(). A
+ * won't be able to increment the per-node counter, until the
+ * lock is dropped by B, but B doesn't drop hugetlb_lock until
+ * no more huge pages can be converted from surplus to normal
+ * state (and doesn't try to convert again). Thus, we have a
+ * case where a surplus huge page exists, the pool is grown, and
+ * the surplus huge page still exists after, even though it
+ * should just have been converted to a normal huge page. This
+ * does not leak memory, though, as the hugepage will be freed
+ * once it is out of use. It also does not allow the counters to
+ * go out of whack in adjust_pool_surplus() as we don't modify
+ * the node values until we've gotten the hugepage and only the
+ * per-node value is checked there.
+ */
+ spin_lock(&hugetlb_lock);
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
+ spin_unlock(&hugetlb_lock);
+ return NULL;
+ } else {
+ h->nr_huge_pages++;
+ h->surplus_huge_pages++;
+ }
+ spin_unlock(&hugetlb_lock);
+
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ else
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+
+ if (page && arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ page = NULL;
+ }
+
+ spin_lock(&hugetlb_lock);
+ if (page) {
+ INIT_LIST_HEAD(&page->lru);
+ r_nid = page_to_nid(page);
+ set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
+ /*
+ * We incremented the global counters already
+ */
+ h->nr_huge_pages_node[r_nid]++;
+ h->surplus_huge_pages_node[r_nid]++;
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ } else {
+ h->nr_huge_pages--;
+ h->surplus_huge_pages--;
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ }
+ spin_unlock(&hugetlb_lock);
+
+ return page;
+}
+
+/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_node(h, nid);
+ spin_unlock(&hugetlb_lock);
+
+ if (!page)
+ page = alloc_buddy_huge_page(h, nid);
+
+ return page;
+}
+
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(struct hstate *h, int delta)
+{
+ struct list_head surplus_list;
+ struct page *page, *tmp;
+ int ret, i;
+ int needed, allocated;
+ bool alloc_ok = true;
+
+ needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
+ if (needed <= 0) {
+ h->resv_huge_pages += delta;
+ return 0;
+ }
+
+ allocated = 0;
+ INIT_LIST_HEAD(&surplus_list);
+
+ ret = -ENOMEM;
+retry:
+ spin_unlock(&hugetlb_lock);
+ for (i = 0; i < needed; i++) {
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page) {
+ alloc_ok = false;
+ break;
+ }
+ list_add(&page->lru, &surplus_list);
+ }
+ allocated += i;
+
+ /*
+ * After retaking hugetlb_lock, we need to recalculate 'needed'
+ * because either resv_huge_pages or free_huge_pages may have changed.
+ */
+ spin_lock(&hugetlb_lock);
+ needed = (h->resv_huge_pages + delta) -
+ (h->free_huge_pages + allocated);
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
+ /*
+ * The surplus_list now contains _at_least_ the number of extra pages
+ * needed to accommodate the reservation. Add the appropriate number
+ * of pages to the hugetlb pool and free the extras back to the buddy
+ * allocator. Commit the entire reservation here to prevent another
+ * process from stealing the pages as they are added to the pool but
+ * before they are reserved.
+ */
+ needed += allocated;
+ h->resv_huge_pages += delta;
+ ret = 0;
+
+ /* Free the needed pages to the hugetlb pool */
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ if ((--needed) < 0)
+ break;
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON(page_count(page));
+ enqueue_huge_page(h, page);
+ }
+free:
+ spin_unlock(&hugetlb_lock);
+
+ /* Free unnecessary surplus pages to the buddy allocator */
+ if (!list_empty(&surplus_list)) {
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ put_page(page);
+ }
+ }
+ spin_lock(&hugetlb_lock);
+
+ return ret;
+}
+
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ * Called with hugetlb_lock held.
+ */
+static void return_unused_surplus_pages(struct hstate *h,
+ unsigned long unused_resv_pages)
+{
+ unsigned long nr_pages;
+
+ /* Uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
+
+ /* Cannot return gigantic pages currently */
+ if (h->order >= MAX_ORDER)
+ return;
+
+ nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
+
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes with memory. Iterate across these nodes
+ * until we can no longer free unreserved surplus pages. This occurs
+ * when the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the the freed pages across the
+ * on-line nodes with memory and will handle the hstate accounting.
+ */
+ while (nr_pages--) {
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+ break;
+ }
+}
+
+/*
+ * Determine if the huge page at addr within the vma has an associated
+ * reservation. Where it does not we will need to logically increase
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
+ */
+static long vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ return region_chg(&inode->i_mapping->private_list,
+ idx, idx + 1);
+
+ } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ return 1;
+
+ } else {
+ long err;
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ err = region_chg(&reservations->regions, idx, idx + 1);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+}
+static void vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ region_add(&inode->i_mapping->private_list, idx, idx + 1);
+
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /* Mark this page used in the map. */
+ region_add(&reservations->regions, idx, idx + 1);
+ }
+}
+
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+{
+ struct hugepage_subpool *spool = subpool_vma(vma);
+ struct hstate *h = hstate_vma(vma);
+ struct page *page;
+ long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+
+ idx = hstate_index(h);
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves and will not have accounted against subpool
+ * limit. Check that the subpool limit can be made before
+ * satisfying the allocation MAP_NORESERVE mappings may also
+ * need pages and subpool limit allocated allocated if no reserve
+ * mapping overlaps.
+ */
+ chg = vma_needs_reservation(h, vma, addr);
+ if (chg < 0)
+ return ERR_PTR(-ENOMEM);
+ if (chg)
+ if (hugepage_subpool_get_pages(spool, chg))
+ return ERR_PTR(-ENOSPC);
+
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret) {
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
+ if (page) {
+ /* update page cgroup details */
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ } else {
+ spin_unlock(&hugetlb_lock);
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page) {
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ list_move(&page->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ }
+
+ set_page_private(page, (unsigned long)spool);
+
+ vma_commit_reservation(h, vma, addr);
+ return page;
+}
+
+int __weak alloc_bootmem_huge_page(struct hstate *h)
+{
+ struct huge_bootmem_page *m;
+ int nr_nodes = nodes_weight(node_states[N_MEMORY]);
+
+ while (nr_nodes) {
+ void *addr;
+
+ addr = __alloc_bootmem_node_nopanic(
+ NODE_DATA(hstate_next_node_to_alloc(h,
+ &node_states[N_MEMORY])),
+ huge_page_size(h), huge_page_size(h), 0);
+
+ if (addr) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ m = addr;
+ goto found;
+ }
+ nr_nodes--;
+ }
+ return 0;
+
+found:
+ BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+ /* Put them into a private list first because mem_map is not up yet */
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = h;
+ return 1;
+}
+
+static void prep_compound_huge_page(struct page *page, int order)
+{
+ if (unlikely(order > (MAX_ORDER - 1)))
+ prep_compound_gigantic_page(page, order);
+ else
+ prep_compound_page(page, order);
+}
+
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
+{
+ struct huge_bootmem_page *m;
+
+ list_for_each_entry(m, &huge_boot_pages, list) {
+ struct hstate *h = m->hstate;
+ struct page *page;
+
+#ifdef CONFIG_HIGHMEM
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
+ __ClearPageReserved(page);
+ WARN_ON(page_count(page) != 1);
+ prep_compound_huge_page(page, h->order);
+ prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+ * fix confusing memory reports from free(1) and another
+ * side-effects, like CommitLimit going negative.
+ */
+ if (h->order > (MAX_ORDER - 1))
+ totalram_pages += 1 << h->order;
+ }
+}
+
+static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+{
+ unsigned long i;
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ if (h->order >= MAX_ORDER) {
+ if (!alloc_bootmem_huge_page(h))
+ break;
+ } else if (!alloc_fresh_huge_page(h,
+ &node_states[N_MEMORY]))
+ break;
+ }
+ h->max_huge_pages = i;
+}
+
+static void __init hugetlb_init_hstates(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /* oversize hugepages were init'ed in early boot */
+ if (h->order < MAX_ORDER)
+ hugetlb_hstate_alloc_pages(h);
+ }
+}
+
+static char * __init memfmt(char *buf, unsigned long n)
+{
+ if (n >= (1UL << 30))
+ sprintf(buf, "%lu GB", n >> 30);
+ else if (n >= (1UL << 20))
+ sprintf(buf, "%lu MB", n >> 20);
+ else
+ sprintf(buf, "%lu KB", n >> 10);
+ return buf;
+}
+
+static void __init report_hugepages(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ char buf[32];
+ pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+ memfmt(buf, huge_page_size(h)),
+ h->free_huge_pages);
+ }
+}
+
+#ifdef CONFIG_HIGHMEM
+static void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
+{
+ int i;
+
+ if (h->order >= MAX_ORDER)
+ return;
+
+ for_each_node_mask(i, *nodes_allowed) {
+ struct page *page, *next;
+ struct list_head *freel = &h->hugepage_freelists[i];
+ list_for_each_entry_safe(page, next, freel, lru) {
+ if (count >= h->nr_huge_pages)
+ return;
+ if (PageHighMem(page))
+ continue;
+ list_del(&page->lru);
+ update_and_free_page(h, page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[page_to_nid(page)]--;
+ }
+ }
+}
+#else
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
+{
+}
+#endif
+
+/*
+ * Increment or decrement surplus_huge_pages. Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+ int delta)
+{
+ int start_nid, next_nid;
+ int ret = 0;
+
+ VM_BUG_ON(delta != -1 && delta != 1);
+
+ if (delta < 0)
+ start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ else
+ start_nid = hstate_next_node_to_free(h, nodes_allowed);
+ next_nid = start_nid;
+
+ do {
+ int nid = next_nid;
+ if (delta < 0) {
+ /*
+ * To shrink on this node, there must be a surplus page
+ */
+ if (!h->surplus_huge_pages_node[nid]) {
+ next_nid = hstate_next_node_to_alloc(h,
+ nodes_allowed);
+ continue;
+ }
+ }
+ if (delta > 0) {
+ /*
+ * Surplus cannot exceed the total number of pages
+ */
+ if (h->surplus_huge_pages_node[nid] >=
+ h->nr_huge_pages_node[nid]) {
+ next_nid = hstate_next_node_to_free(h,
+ nodes_allowed);
+ continue;
+ }
+ }
+
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[nid] += delta;
+ ret = 1;
+ break;
+ } while (next_nid != start_nid);
+
+ return ret;
+}
+
+#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
+{
+ unsigned long min_count, ret;
+
+ if (h->order >= MAX_ORDER)
+ return h->max_huge_pages;
+
+ /*
+ * Increase the pool size
+ * First take pages out of surplus state. Then make up the
+ * remaining difference by allocating fresh huge pages.
+ *
+ * We might race with alloc_buddy_huge_page() here and be unable
+ * to convert a surplus huge page to a normal huge page. That is
+ * not critical, though, it just means the overall size of the
+ * pool might be one hugepage larger than it needs to be, but
+ * within all the constraints specified by the sysctls.
+ */
+ spin_lock(&hugetlb_lock);
+ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, nodes_allowed, -1))
+ break;
+ }
+
+ while (count > persistent_huge_pages(h)) {
+ /*
+ * If this allocation races such that we no longer need the
+ * page, free_huge_page will handle it by freeing the page
+ * and reducing the surplus.
+ */
+ spin_unlock(&hugetlb_lock);
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
+ spin_lock(&hugetlb_lock);
+ if (!ret)
+ goto out;
+
+ /* Bail for signals. Probably ctrl-c from user */
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /*
+ * Decrease the pool size
+ * First return free pages to the buddy allocator (being careful
+ * to keep enough around to satisfy reservations). Then place
+ * pages into surplus state as needed so the pool will shrink
+ * to the desired size as pages become free.
+ *
+ * By placing pages into the surplus state independent of the
+ * overcommit value, we are allowing the surplus pool size to
+ * exceed overcommit. There are few sane options here. Since
+ * alloc_buddy_huge_page() is checking the global counter,
+ * though, we'll note that we're not allowed to exceed surplus
+ * and won't grow the pool anywhere else. Not until one of the
+ * sysctls are changed, or the surplus pages go out of use.
+ */
+ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ min_count = max(count, min_count);
+ try_to_free_low(h, min_count, nodes_allowed);
+ while (min_count < persistent_huge_pages(h)) {
+ if (!free_pool_huge_page(h, nodes_allowed, 0))
+ break;
+ }
+ while (count < persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, nodes_allowed, 1))
+ break;
+ }
+out:
+ ret = persistent_huge_pages(h);
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+#define HSTATE_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct kobject *hugepages_kobj;
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
+{
+ int i;
+
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = NUMA_NO_NODE;
+ return &hstates[i];
+ }
+
+ return kobj_to_node_hstate(kobj, nidp);
+}
+
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long nr_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ nr_huge_pages = h->nr_huge_pages;
+ else
+ nr_huge_pages = h->nr_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", nr_huge_pages);
+}
+
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+ struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int err;
+ int nid;
+ unsigned long count;
+ struct hstate *h;
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+
+ err = strict_strtoul(buf, 10, &count);
+ if (err)
+ goto out;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (h->order >= MAX_ORDER) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (nid == NUMA_NO_NODE) {
+ /*
+ * global hstate attribute
+ */
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(nodes_allowed))) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+ } else if (nodes_allowed) {
+ /*
+ * per node hstate attribute: adjust count to global,
+ * but restrict alloc/free to the specified node.
+ */
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ init_nodemask_of_node(nodes_allowed, nid);
+ } else
+ nodes_allowed = &node_states[N_MEMORY];
+
+ h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+
+ if (nodes_allowed != &node_states[N_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+
+ return len;
+out:
+ NODEMASK_FREE(nodes_allowed);
+ return err;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(false, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages);
+
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long input;
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+
+ if (h->order >= MAX_ORDER)
+ return -EINVAL;
+
+ err = strict_strtoul(buf, 10, &input);
+ if (err)
+ return err;
+
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = input;
+ spin_unlock(&hugetlb_lock);
+
+ return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long free_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ free_huge_pages = h->free_huge_pages;
+ else
+ free_huge_pages = h->free_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long surplus_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ surplus_huge_pages = h->surplus_huge_pages;
+ else
+ surplus_huge_pages = h->surplus_huge_pages_node[nid];
+
+ return sprintf(buf, "%lu\n", surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static struct attribute *hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &nr_overcommit_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &resv_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+ &nr_hugepages_mempolicy_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hstate_attr_group = {
+ .attrs = hstate_attrs,
+};
+
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+ struct kobject **hstate_kobjs,
+ struct attribute_group *hstate_attr_group)
+{
+ int retval;
+ int hi = hstate_index(h);
+
+ hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
+ if (!hstate_kobjs[hi])
+ return -ENOMEM;
+
+ retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
+ if (retval)
+ kobject_put(hstate_kobjs[hi]);
+
+ return retval;
+}
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("Hugetlb: Unable to add hstate %s", h->name);
+ }
+}
+
+#ifdef CONFIG_NUMA
+
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
+ * the base kernel, on the hugetlb module.
+ */
+struct node_hstate {
+ struct kobject *hugepages_kobj;
+ struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+};
+struct node_hstate node_hstates[MAX_NUMNODES];
+
+/*
+ * A subset of global hstate attributes for node devices
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+static struct attribute_group per_node_hstate_attr_group = {
+ .attrs = per_node_hstate_attrs,
+};
+
+/*
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
+ * Returns node id via non-NULL nidp.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++) {
+ struct node_hstate *nhs = &node_hstates[nid];
+ int i;
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (nhs->hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = nid;
+ return &hstates[i];
+ }
+ }
+
+ BUG();
+ return NULL;
+}
+
+/*
+ * Unregister hstate attributes from a single node device.
+ * No-op if no hstate attributes attached.
+ */
+void hugetlb_unregister_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+
+ if (!nhs->hugepages_kobj)
+ return; /* no hstate attributes */
+
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
+ }
+ }
+
+ kobject_put(nhs->hugepages_kobj);
+ nhs->hugepages_kobj = NULL;
+}
+
+/*
+ * hugetlb module exit: unregister hstate attributes from node devices
+ * that have them.
+ */
+static void hugetlb_unregister_all_nodes(void)
+{
+ int nid;
+
+ /*
+ * disable node device registrations.
+ */
+ register_hugetlbfs_with_node(NULL, NULL);
+
+ /*
+ * remove hstate attributes from any nodes that have them.
+ */
+ for (nid = 0; nid < nr_node_ids; nid++)
+ hugetlb_unregister_node(node_devices[nid]);
+}
+
+/*
+ * Register hstate attributes for a single node device.
+ * No-op if attributes already registered.
+ */
+void hugetlb_register_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+ int err;
+
+ if (nhs->hugepages_kobj)
+ return; /* already allocated */
+
+ nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+ &node->dev.kobj);
+ if (!nhs->hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+ nhs->hstate_kobjs,
+ &per_node_hstate_attr_group);
+ if (err) {
+ pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ h->name, node->dev.id);
+ hugetlb_unregister_node(node);
+ break;
+ }
+ }
+}
+
+/*
+ * hugetlb init time: register hstate attributes for all registered node
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
+ */
+static void hugetlb_register_all_nodes(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct node *node = node_devices[nid];
+ if (node->dev.id == nid)
+ hugetlb_register_node(node);
+ }
+
+ /*
+ * Let the node device driver know we're here so it can
+ * [un]register hstate attributes on node hotplug.
+ */
+ register_hugetlbfs_with_node(hugetlb_register_node,
+ hugetlb_unregister_node);
+}
+#else /* !CONFIG_NUMA */
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ BUG();
+ if (nidp)
+ *nidp = -1;
+ return NULL;
+}
+
+static void hugetlb_unregister_all_nodes(void) { }
+
+static void hugetlb_register_all_nodes(void) { }
+
+#endif
+
+static void __exit hugetlb_exit(void)
+{
+ struct hstate *h;
+
+ hugetlb_unregister_all_nodes();
+
+ for_each_hstate(h) {
+ kobject_put(hstate_kobjs[hstate_index(h)]);
+ }
+
+ kobject_put(hugepages_kobj);
+}
+module_exit(hugetlb_exit);
+
+static int __init hugetlb_init(void)
+{
+ /* Some platform decide whether they support huge pages at boot
+ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+ * there is no such support
+ */
+ if (HPAGE_SHIFT == 0)
+ return 0;
+
+ if (!size_to_hstate(default_hstate_size)) {
+ default_hstate_size = HPAGE_SIZE;
+ if (!size_to_hstate(default_hstate_size))
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ }
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
+ if (default_hstate_max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+
+ hugetlb_init_hstates();
+ gather_bootmem_prealloc();
+ report_hugepages();
+
+ hugetlb_sysfs_init();
+ hugetlb_register_all_nodes();
+ hugetlb_cgroup_file_init();
+
+ return 0;
+}
+module_init(hugetlb_init);
+
+/* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_add_hstate(unsigned order)
+{
+ struct hstate *h;
+ unsigned long i;
+
+ if (size_to_hstate(PAGE_SIZE << order)) {
+ pr_warning("hugepagesz= specified twice, ignoring\n");
+ return;
+ }
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(order == 0);
+ h = &hstates[hugetlb_max_hstate++];
+ h->order = order;
+ h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->nr_huge_pages = 0;
+ h->free_huge_pages = 0;
+ for (i = 0; i < MAX_NUMNODES; ++i)
+ INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
+ h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+ snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
+ huge_page_size(h)/1024);
+
+ parsed_hstate = h;
+}
+
+static int __init hugetlb_nrpages_setup(char *s)
+{
+ unsigned long *mhp;
+ static unsigned long *last_mhp;
+
+ /*
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * so this hugepages= parameter goes to the "default hstate".
+ */
+ if (!hugetlb_max_hstate)
+ mhp = &default_hstate_max_huge_pages;
+ else
+ mhp = &parsed_hstate->max_huge_pages;
+
+ if (mhp == last_mhp) {
+ pr_warning("hugepages= specified twice without "
+ "interleaving hugepagesz=, ignoring\n");
+ return 1;
+ }
+
+ if (sscanf(s, "%lu", mhp) <= 0)
+ *mhp = 0;
+
+ /*
+ * Global state is always initialized later in hugetlb_init.
+ * But we need to allocate >= MAX_ORDER hstates here early to still
+ * use the bootmem allocator.
+ */
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+ hugetlb_hstate_alloc_pages(parsed_hstate);
+
+ last_mhp = mhp;
+
+ return 1;
+}
+__setup("hugepages=", hugetlb_nrpages_setup);
+
+static int __init hugetlb_default_setup(char *s)
+{
+ default_hstate_size = memparse(s, &s);
+ return 1;
+}
+__setup("default_hugepagesz=", hugetlb_default_setup);
+
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+ int node;
+ unsigned int nr = 0;
+
+ for_each_node_mask(node, cpuset_current_mems_allowed)
+ nr += array[node];
+
+ return nr;
+}
+
+#ifdef CONFIG_SYSCTL
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+ int ret;
+
+ tmp = h->max_huge_pages;
+
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
+ table->data = &tmp;
+ table->maxlen = sizeof(unsigned long);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
+
+ if (write) {
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed,
+ GFP_KERNEL | __GFP_NORETRY);
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(nodes_allowed))) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+ h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
+
+ if (nodes_allowed != &node_states[N_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+ }
+out:
+ return ret;
+}
+
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+
+ return hugetlb_sysctl_handler_common(false, table, write,
+ buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ return hugetlb_sysctl_handler_common(true, table, write,
+ buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
+int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, buffer, length, ppos);
+ if (hugepages_treat_as_movable)
+ htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
+ else
+ htlb_alloc_mask = GFP_HIGHUSER;
+ return 0;
+}
+
+int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+ int ret;
+
+ tmp = h->nr_overcommit_huge_pages;
+
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
+
+ table->data = &tmp;
+ table->maxlen = sizeof(unsigned long);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
+
+ if (write) {
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = tmp;
+ spin_unlock(&hugetlb_lock);
+ }
+out:
+ return ret;
+}
+
+#endif /* CONFIG_SYSCTL */
+
+void hugetlb_report_meminfo(struct seq_file *m)
+{
+ struct hstate *h = &default_hstate;
+ seq_printf(m,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
+ "HugePages_Surp: %5lu\n"
+ "Hugepagesize: %8lu kB\n",
+ h->nr_huge_pages,
+ h->free_huge_pages,
+ h->resv_huge_pages,
+ h->surplus_huge_pages,
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+}
+
+int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+ struct hstate *h = &default_hstate;
+ return sprintf(buf,
+ "Node %d HugePages_Total: %5u\n"
+ "Node %d HugePages_Free: %5u\n"
+ "Node %d HugePages_Surp: %5u\n",
+ nid, h->nr_huge_pages_node[nid],
+ nid, h->free_huge_pages_node[nid],
+ nid, h->surplus_huge_pages_node[nid]);
+}
+
+/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+ struct hstate *h = &default_hstate;
+ return h->nr_huge_pages * pages_per_huge_page(h);
+}
+
+static int hugetlb_acct_memory(struct hstate *h, long delta)
+{
+ int ret = -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. Application can still potentially OOM'ed by kernel
+ * with lack of free htlb page in cpuset that the task is in.
+ * Attempt to enforce strict accounting with cpuset is almost
+ * impossible (or too ugly) because cpuset is too fluid that
+ * task or memory node can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to check against current free page availability as
+ * a best attempt and hopefully to minimize the impact of changing
+ * semantics that cpuset has.
+ */
+ if (delta > 0) {
+ if (gather_surplus_pages(h, delta) < 0)
+ goto out;
+
+ if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ return_unused_surplus_pages(h, delta);
+ goto out;
+ }
+ }
+
+ ret = 0;
+ if (delta < 0)
+ return_unused_surplus_pages(h, (unsigned long) -delta);
+
+out:
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /*
+ * This new VMA should share its siblings reservation map if present.
+ * The VMA will only ever have a valid reservation map pointer where
+ * it is being copied for another still existing VMA. As that VMA
+ * has a reference to the reservation map it cannot disappear until
+ * after this open call completes. It is therefore safe to take a
+ * new reference here without additional locking.
+ */
+ if (reservations)
+ kref_get(&reservations->refs);
+}
+
+static void resv_map_put(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (!reservations)
+ return;
+ kref_put(&reservations->refs, resv_map_release);
+}
+
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct resv_map *reservations = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
+ unsigned long reserve;
+ unsigned long start;
+ unsigned long end;
+
+ if (reservations) {
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
+
+ reserve = (end - start) -
+ region_count(&reservations->regions, start, end);
+
+ resv_map_put(vma);
+
+ if (reserve) {
+ hugetlb_acct_memory(h, -reserve);
+ hugepage_subpool_put_pages(spool, reserve);
+ }
+ }
+}
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all. They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
+ * this far.
+ */
+static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ BUG();
+ return 0;
+}
+
+const struct vm_operations_struct hugetlb_vm_ops = {
+ .fault = hugetlb_vm_op_fault,
+ .open = hugetlb_vm_op_open,
+ .close = hugetlb_vm_op_close,
+};
+
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
+{
+ pte_t entry;
+
+ if (writable) {
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ } else {
+ entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ }
+ entry = pte_mkyoung(entry);
+ entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, vma, page, writable);
+
+ return entry;
+}
+
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
+ update_mmu_cache(vma, address, ptep);
+}
+
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr;
+ int cow;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte)
+ continue;
+ dst_pte = huge_pte_alloc(dst, addr, sz);
+ if (!dst_pte)
+ goto nomem;
+
+ /* If the pagetables are shared don't copy or take references */
+ if (dst_pte == src_pte)
+ continue;
+
+ spin_lock(&dst->page_table_lock);
+ spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+ if (!huge_pte_none(huge_ptep_get(src_pte))) {
+ if (cow)
+ huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_ptep_get(src_pte);
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ page_dup_rmap(ptepage);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ }
+ spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
+ }
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
+
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
+{
+ int force_flush = 0;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *ptep;
+ pte_t pte;
+ struct page *page;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ const unsigned long mmun_start = start; /* For mmu_notifiers */
+ const unsigned long mmun_end = end; /* For mmu_notifiers */
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~huge_page_mask(h));
+ BUG_ON(end & ~huge_page_mask(h));
+
+ tlb_start_vma(tlb, vma);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+again:
+ spin_lock(&mm->page_table_lock);
+ for (address = start; address < end; address += sz) {
+ ptep = huge_pte_offset(mm, address);
+ if (!ptep)
+ continue;
+
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
+
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ continue;
+
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ pte_clear(mm, address, ptep);
+ continue;
+ }
+
+ page = pte_page(pte);
+ /*
+ * If a reference page is supplied, it is because a specific
+ * page is being unmapped, not a range. Ensure the page we
+ * are about to unmap is the actual page of interest.
+ */
+ if (ref_page) {
+ if (page != ref_page)
+ continue;
+
+ /*
+ * Mark the VMA as having unmapped its page so that
+ * future faults in this VMA will fail rather than
+ * looking like data was lost
+ */
+ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ }
+
+ pte = huge_ptep_get_and_clear(mm, address, ptep);
+ tlb_remove_tlb_entry(tlb, ptep, address);
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
+ /* Bail out after unmapping reference page if supplied */
+ if (ref_page)
+ break;
+ }
+ spin_unlock(&mm->page_table_lock);
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
+ }
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_mutex is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, 0);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
+}
+
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mappping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page *page, unsigned long address)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct vm_area_struct *iter_vma;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+
+ /*
+ * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+ * from page cache lookup which is in HPAGE_SIZE units.
+ */
+ address = address & huge_page_mask(h);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ mapping = file_inode(vma->vm_file)->i_mapping;
+
+ /*
+ * Take the mapping lock for the duration of the table walk. As
+ * this mapping should be shared between all the VMAs,
+ * __unmap_hugepage_range() is called as the lock is already held
+ */
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
+ /* Do not unmap the current VMA */
+ if (iter_vma == vma)
+ continue;
+
+ /*
+ * Unmap the page from other VMAs without their own reserves.
+ * They get marked to be SIGKILLed if they fault in these
+ * areas. This is because a future no-page fault on this VMA
+ * could insert a zeroed page instead of the data existing
+ * from the time of fork. This would look like data corruption
+ */
+ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ return 1;
+}
+
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
+ */
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte,
+ struct page *pagecache_page)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct page *old_page, *new_page;
+ int avoidcopy;
+ int outside_reserve = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ old_page = pte_page(pte);
+
+retry_avoidcopy:
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_mapcount(old_page) == 1);
+ if (avoidcopy) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
+ set_huge_ptep_writable(vma, address, ptep);
+ return 0;
+ }
+
+ /*
+ * If the process that created a MAP_PRIVATE mapping is about to
+ * perform a COW due to a shared page count, attempt to satisfy
+ * the allocation without using the existing reserves. The pagecache
+ * page is used to determine if the reserve at this address was
+ * consumed or not. If reserves were used, a partial faulted mapping
+ * at the time of fork() could consume its reserves on COW instead
+ * of the full address range.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE) &&
+ is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ old_page != pagecache_page)
+ outside_reserve = 1;
+
+ page_cache_get(old_page);
+
+ /* Drop page_table_lock as buddy allocator may be called */
+ spin_unlock(&mm->page_table_lock);
+ new_page = alloc_huge_page(vma, address, outside_reserve);
+
+ if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
+ page_cache_release(old_page);
+
+ /*
+ * If a process owning a MAP_PRIVATE mapping fails to COW,
+ * it is due to references held by a child and an insufficient
+ * huge page pool. To guarantee the original mappers
+ * reliability, unmap the page from child processes. The child
+ * may get SIGKILLed if it later faults.
+ */
+ if (outside_reserve) {
+ BUG_ON(huge_pte_none(pte));
+ if (unmap_ref_private(mm, vma, old_page, address)) {
+ BUG_ON(huge_pte_none(pte));
+ spin_lock(&mm->page_table_lock);
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page_table_lock, and
+ * our job is done.
+ */
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ }
+
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
+ }
+
+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma))) {
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
+ return VM_FAULT_OOM;
+ }
+
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
+ __SetPageUptodate(new_page);
+
+ mmun_start = address & huge_page_mask(h);
+ mmun_end = mmun_start + huge_page_size(h);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ /*
+ * Retake the page_table_lock to check for racing updates
+ * before the page tables are altered
+ */
+ spin_lock(&mm->page_table_lock);
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ /* Break COW */
+ huge_ptep_clear_flush(vma, address, ptep);
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_new_anon_rmap(new_page, vma, address);
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return 0;
+}
+
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ return find_lock_page(mapping, idx);
+}
+
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ struct page *page;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ page = find_get_page(mapping, idx);
+ if (page)
+ put_page(page);
+ return page != NULL;
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ int ret = VM_FAULT_SIGBUS;
+ int anon_rmap = 0;
+ pgoff_t idx;
+ unsigned long size;
+ struct page *page;
+ struct address_space *mapping;
+ pte_t new_pte;
+
+ /*
+ * Currently, we are forced to kill the process in the event the
+ * original mapper has unmapped pages from the child due to a failed
+ * COW. Warn that such a situation has occurred as it may not be obvious
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+ pr_warning("PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
+ return ret;
+ }
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+retry:
+ page = find_lock_page(mapping, idx);
+ if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+ page = alloc_huge_page(vma, address, 0);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ clear_huge_page(page, address, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ int err;
+ struct inode *inode = mapping->host;
+
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ if (err == -EEXIST)
+ goto retry;
+ goto out;
+ }
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ } else {
+ lock_page(page);
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ anon_rmap = 1;
+ }
+ } else {
+ /*
+ * If memory error occurs between mmap() and fault, some process
+ * don't have hwpoisoned swap entry for errored virtual address.
+ * So we need to block hugepage fault by PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ goto backout_unlocked;
+ }
+ }
+
+ /*
+ * If we are going to COW a private mapping later, we examine the
+ * pending reservations for this page now. This will ensure that
+ * any allocations necessary to record that reservation occur outside
+ * the spinlock.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto backout;
+
+ ret = 0;
+ if (!huge_pte_none(huge_ptep_get(ptep)))
+ goto backout;
+
+ if (anon_rmap)
+ hugepage_add_new_anon_rmap(page, vma, address);
+ else
+ page_dup_rmap(page);
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
+ return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+backout_unlocked:
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ pte_t *ptep;
+ pte_t entry;
+ int ret;
+ struct page *page = NULL;
+ struct page *pagecache_page = NULL;
+ static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+ struct hstate *h = hstate_vma(vma);
+
+ address &= huge_page_mask(h);
+
+ ptep = huge_pte_offset(mm, address);
+ if (ptep) {
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ }
+
+ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+ if (!ptep)
+ return VM_FAULT_OOM;
+
+ /*
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mutex_lock(&hugetlb_instantiation_mutex);
+ entry = huge_ptep_get(ptep);
+ if (huge_pte_none(entry)) {
+ ret = hugetlb_no_page(mm, vma, address, ptep, flags);
+ goto out_mutex;
+ }
+
+ ret = 0;
+
+ /*
+ * If we are going to COW the mapping later, we examine the pending
+ * reservations for this page now. This will ensure that any
+ * allocations necessary to record that reservation occur outside the
+ * spinlock. For private mappings, we also lookup the pagecache
+ * page now as it is used to determine if a reservation has been
+ * consumed.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto out_mutex;
+ }
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ pagecache_page = hugetlbfs_pagecache_page(h,
+ vma, address);
+ }
+
+ /*
+ * hugetlb_cow() requires page locks of pte_page(entry) and
+ * pagecache_page, so here we need take the former one
+ * when page != pagecache_page or !pagecache_page.
+ * Note that locking order is always pagecache_page -> page,
+ * so no worry about deadlock.
+ */
+ page = pte_page(entry);
+ get_page(page);
+ if (page != pagecache_page)
+ lock_page(page);
+
+ spin_lock(&mm->page_table_lock);
+ /* Check for a racing update before calling hugetlb_cow */
+ if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ goto out_page_table_lock;
+
+
+ if (flags & FAULT_FLAG_WRITE) {
+ if (!pte_write(entry)) {
+ ret = hugetlb_cow(mm, vma, address, ptep, entry,
+ pagecache_page);
+ goto out_page_table_lock;
+ }
+ entry = pte_mkdirty(entry);
+ }
+ entry = pte_mkyoung(entry);
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry,
+ flags & FAULT_FLAG_WRITE))
+ update_mmu_cache(vma, address, ptep);
+
+out_page_table_lock:
+ spin_unlock(&mm->page_table_lock);
+
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+ if (page != pagecache_page)
+ unlock_page(page);
+ put_page(page);
+
+out_mutex:
+ mutex_unlock(&hugetlb_instantiation_mutex);
+
+ return ret;
+}
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags)
+{
+ unsigned long pfn_offset;
+ unsigned long vaddr = *position;
+ unsigned long remainder = *nr_pages;
+ struct hstate *h = hstate_vma(vma);
+
+ spin_lock(&mm->page_table_lock);
+ while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ int absent;
+ struct page *page;
+
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make sure we get the
+ * first, for the page indexing below to work.
+ */
+ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+ absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+ /*
+ * When coredumping, it suits get_dump_page if we just return
+ * an error where there's an empty slot with no huge pagecache
+ * to back it. This way, we avoid allocating a hugepage, and
+ * the sparse dumpfile avoids allocating disk blocks, but its
+ * huge holes still show up with zeroes where they need to be.
+ */
+ if (absent && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ remainder = 0;
+ break;
+ }
+
+ if (absent ||
+ ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
+ int ret;
+
+ spin_unlock(&mm->page_table_lock);
+ ret = hugetlb_fault(mm, vma, vaddr,
+ (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
+ spin_lock(&mm->page_table_lock);
+ if (!(ret & VM_FAULT_ERROR))
+ continue;
+
+ remainder = 0;
+ break;
+ }
+
+ pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
+ page = pte_page(huge_ptep_get(pte));
+same_page:
+ if (pages) {
+ pages[i] = mem_map_offset(page, pfn_offset);
+ get_page(pages[i]);
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++pfn_offset;
+ --remainder;
+ ++i;
+ if (vaddr < vma->vm_end && remainder &&
+ pfn_offset < pages_per_huge_page(h)) {
+ /*
+ * We use pfn_offset to avoid touching the pageframes
+ * of this compound page.
+ */
+ goto same_page;
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+ *nr_pages = remainder;
+ *position = vaddr;
+
+ return i ? i : -EFAULT;
+}
+
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long start = address;
+ pte_t *ptep;
+ pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
+
+ BUG_ON(address >= end);
+ flush_cache_range(vma, address, end);
+
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ spin_lock(&mm->page_table_lock);
+ for (; address < end; address += huge_page_size(h)) {
+ ptep = huge_pte_offset(mm, address);
+ if (!ptep)
+ continue;
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
+ continue;
+ }
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
+ pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = pte_mkhuge(pte_modify(pte, newprot));
+ pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ set_huge_pte_at(mm, address, ptep, pte);
+ pages++;
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
+ flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+
+ return pages << h->order;
+}
+
+int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ long ret, chg;
+ struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
+
+ /*
+ * Only apply hugepage reservation if asked. At fault time, an
+ * attempt will be made for VM_NORESERVE to allocate a page
+ * without using reserves
+ */
+ if (vm_flags & VM_NORESERVE)
+ return 0;
+
+ /*
+ * Shared mappings base their reservation on the number of pages that
+ * are already allocated on behalf of the file. Private mappings need
+ * to reserve the full area even if read-only as mprotect() may be
+ * called to make the mapping read-write. Assume !vma is a shm mapping
+ */
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ chg = region_chg(&inode->i_mapping->private_list, from, to);
+ else {
+ struct resv_map *resv_map = resv_map_alloc();
+ if (!resv_map)
+ return -ENOMEM;
+
+ chg = to - from;
+
+ set_vma_resv_map(vma, resv_map);
+ set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ }
+
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
+
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg)) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
+
+ /*
+ * Check enough hugepages are available for the reservation.
+ * Hand the pages back to the subpool if there are not
+ */
+ ret = hugetlb_acct_memory(h, chg);
+ if (ret < 0) {
+ hugepage_subpool_put_pages(spool, chg);
+ goto out_err;
+ }
+
+ /*
+ * Account for the reservations made. Shared mappings record regions
+ * that have reservations as they are shared by multiple VMAs.
+ * When the last VMA disappears, the region map says how much
+ * the reservation was and the page cache tells how much of
+ * the reservation was consumed. Private mappings are per-VMA and
+ * only the consumed reservations are tracked. When the VMA
+ * disappears, the original reservation is the VMA size and the
+ * consumed reservations are stored in the map. Hence, nothing
+ * else has to be done for private mappings here
+ */
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ region_add(&inode->i_mapping->private_list, from, to);
+ return 0;
+out_err:
+ if (vma)
+ resv_map_put(vma);
+ return ret;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+ struct hstate *h = hstate_inode(inode);
+ long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct hugepage_subpool *spool = subpool_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks -= (blocks_per_huge_page(h) * freed);
+ spin_unlock(&inode->i_lock);
+
+ hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -(chg - freed));
+}
+
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+ if (page == hpage)
+ return 1;
+ return 0;
+}
+
+/*
+ * This function is called from memory failure code.
+ * Assume the caller holds page lock of the head page.
+ */
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
+{
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+ int ret = -EBUSY;
+
+ spin_lock(&hugetlb_lock);
+ if (is_hugepage_on_freelist(hpage)) {
+ /*
+ * Hwpoisoned hugepage isn't linked to activelist or freelist,
+ * but dangling hpage->lru can trigger list-debug warnings
+ * (this happens when we call unpoison_memory() on it),
+ * so let it point to itself with list_del_init().
+ */
+ list_del_init(&hpage->lru);
+ set_page_refcounted(hpage);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ ret = 0;
+ }
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+#endif
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 00000000..9cea7de2
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,426 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+struct hugetlb_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the counter to account for hugepages from hugetlb.
+ */
+ struct res_counter hugepage[HUGE_MAX_HSTATE];
+};
+
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+struct cgroup_subsys hugetlb_subsys __read_mostly;
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+ return container_of(s, struct hugetlb_cgroup, css);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
+ hugetlb_subsys_id));
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
+{
+ return hugetlb_cgroup_from_css(task_subsys_state(task,
+ hugetlb_subsys_id));
+}
+
+static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
+{
+ return (h_cg == root_h_cgroup);
+}
+
+static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
+{
+ if (!cg->parent)
+ return NULL;
+ return hugetlb_cgroup_from_cgroup(cg->parent);
+}
+
+static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
+{
+ int idx;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
+
+ for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
+ return true;
+ }
+ return false;
+}
+
+static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
+{
+ int idx;
+ struct cgroup *parent_cgroup;
+ struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
+
+ h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
+ if (!h_cgroup)
+ return ERR_PTR(-ENOMEM);
+
+ parent_cgroup = cgroup->parent;
+ if (parent_cgroup) {
+ parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+ res_counter_init(&h_cgroup->hugepage[idx],
+ &parent_h_cgroup->hugepage[idx]);
+ } else {
+ root_h_cgroup = h_cgroup;
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+ res_counter_init(&h_cgroup->hugepage[idx], NULL);
+ }
+ return &h_cgroup->css;
+}
+
+static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
+{
+ struct hugetlb_cgroup *h_cgroup;
+
+ h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
+ kfree(h_cgroup);
+}
+
+
+/*
+ * Should be called with hugetlb_lock held.
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, So no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
+ struct page *page)
+{
+ int csize;
+ struct res_counter *counter;
+ struct res_counter *fail_res;
+ struct hugetlb_cgroup *page_hcg;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+ struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
+
+ page_hcg = hugetlb_cgroup_from_page(page);
+ /*
+ * We can have pages in active list without any cgroup
+ * ie, hugepage with less than 3 pages. We can safely
+ * ignore those pages.
+ */
+ if (!page_hcg || page_hcg != h_cg)
+ goto out;
+
+ csize = PAGE_SIZE << compound_order(page);
+ if (!parent) {
+ parent = root_h_cgroup;
+ /* root has no limit */
+ res_counter_charge_nofail(&parent->hugepage[idx],
+ csize, &fail_res);
+ }
+ counter = &h_cg->hugepage[idx];
+ res_counter_uncharge_until(counter, counter->parent, csize);
+
+ set_hugetlb_cgroup(page, parent);
+out:
+ return;
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ */
+static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
+{
+ struct hstate *h;
+ struct page *page;
+ int idx = 0;
+
+ do {
+ for_each_hstate(h) {
+ spin_lock(&hugetlb_lock);
+ list_for_each_entry(page, &h->hugepage_activelist, lru)
+ hugetlb_cgroup_move_parent(idx, cgroup, page);
+
+ spin_unlock(&hugetlb_lock);
+ idx++;
+ }
+ cond_resched();
+ } while (hugetlb_cgroup_have_usage(cgroup));
+}
+
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr)
+{
+ int ret = 0;
+ struct res_counter *fail_res;
+ struct hugetlb_cgroup *h_cg = NULL;
+ unsigned long csize = nr_pages * PAGE_SIZE;
+
+ if (hugetlb_cgroup_disabled())
+ goto done;
+ /*
+ * We don't charge any cgroup if the compound page have less
+ * than 3 pages.
+ */
+ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+ goto done;
+again:
+ rcu_read_lock();
+ h_cg = hugetlb_cgroup_from_task(current);
+ if (!css_tryget(&h_cg->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
+ css_put(&h_cg->css);
+done:
+ *ptr = h_cg;
+ return ret;
+}
+
+/* Should be called with hugetlb_lock held */
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ if (hugetlb_cgroup_disabled() || !h_cg)
+ return;
+
+ set_hugetlb_cgroup(page, h_cg);
+ return;
+}
+
+/*
+ * Should be called with hugetlb_lock held
+ */
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page)
+{
+ struct hugetlb_cgroup *h_cg;
+ unsigned long csize = nr_pages * PAGE_SIZE;
+
+ if (hugetlb_cgroup_disabled())
+ return;
+ VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+ h_cg = hugetlb_cgroup_from_page(page);
+ if (unlikely(!h_cg))
+ return;
+ set_hugetlb_cgroup(page, NULL);
+ res_counter_uncharge(&h_cg->hugepage[idx], csize);
+ return;
+}
+
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg)
+{
+ unsigned long csize = nr_pages * PAGE_SIZE;
+
+ if (hugetlb_cgroup_disabled() || !h_cg)
+ return;
+
+ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+ return;
+
+ res_counter_uncharge(&h_cg->hugepage[idx], csize);
+ return;
+}
+
+static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
+ struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ u64 val;
+ char str[64];
+ int idx, name, len;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+ idx = MEMFILE_IDX(cft->private);
+ name = MEMFILE_ATTR(cft->private);
+
+ val = res_counter_read_u64(&h_cg->hugepage[idx], name);
+ len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+ return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
+ const char *buffer)
+{
+ int idx, name, ret;
+ unsigned long long val;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+ idx = MEMFILE_IDX(cft->private);
+ name = MEMFILE_ATTR(cft->private);
+
+ switch (name) {
+ case RES_LIMIT:
+ if (hugetlb_cgroup_is_root(h_cg)) {
+ /* Can't set limit on root */
+ ret = -EINVAL;
+ break;
+ }
+ /* This function does all necessary parse...reuse it */
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (ret)
+ break;
+ ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
+{
+ int idx, name, ret = 0;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+ idx = MEMFILE_IDX(event);
+ name = MEMFILE_ATTR(event);
+
+ switch (name) {
+ case RES_MAX_USAGE:
+ res_counter_reset_max(&h_cg->hugepage[idx]);
+ break;
+ case RES_FAILCNT:
+ res_counter_reset_failcnt(&h_cg->hugepage[idx]);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static char *mem_fmt(char *buf, int size, unsigned long hsize)
+{
+ if (hsize >= (1UL << 30))
+ snprintf(buf, size, "%luGB", hsize >> 30);
+ else if (hsize >= (1UL << 20))
+ snprintf(buf, size, "%luMB", hsize >> 20);
+ else
+ snprintf(buf, size, "%luKB", hsize >> 10);
+ return buf;
+}
+
+static void __init __hugetlb_cgroup_file_init(int idx)
+{
+ char buf[32];
+ struct cftype *cft;
+ struct hstate *h = &hstates[idx];
+
+ /* format the size */
+ mem_fmt(buf, 32, huge_page_size(h));
+
+ /* Add the limit file */
+ cft = &h->cgroup_files[0];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+ cft->read = hugetlb_cgroup_read;
+ cft->write_string = hugetlb_cgroup_write;
+
+ /* Add the usage file */
+ cft = &h->cgroup_files[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+ cft->read = hugetlb_cgroup_read;
+
+ /* Add the MAX usage file */
+ cft = &h->cgroup_files[2];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
+ cft->trigger = hugetlb_cgroup_reset;
+ cft->read = hugetlb_cgroup_read;
+
+ /* Add the failcntfile */
+ cft = &h->cgroup_files[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+ cft->trigger = hugetlb_cgroup_reset;
+ cft->read = hugetlb_cgroup_read;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files[4];
+ memset(cft, 0, sizeof(*cft));
+
+ WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+
+ return;
+}
+
+void __init hugetlb_cgroup_file_init(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].lru.next for storing cgroup details.
+ */
+ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
+ __hugetlb_cgroup_file_init(hstate_index(h));
+ }
+}
+
+/*
+ * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
+ * when we migrate hugepages
+ */
+void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+{
+ struct hugetlb_cgroup *h_cg;
+ struct hstate *h = page_hstate(oldhpage);
+
+ if (hugetlb_cgroup_disabled())
+ return;
+
+ VM_BUG_ON(!PageHuge(oldhpage));
+ spin_lock(&hugetlb_lock);
+ h_cg = hugetlb_cgroup_from_page(oldhpage);
+ set_hugetlb_cgroup(oldhpage, NULL);
+
+ /* move the h_cg details to new cgroup */
+ set_hugetlb_cgroup(newhpage, h_cg);
+ list_move(&newhpage->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ return;
+}
+
+struct cgroup_subsys hugetlb_subsys = {
+ .name = "hugetlb",
+ .css_alloc = hugetlb_cgroup_css_alloc,
+ .css_offline = hugetlb_cgroup_css_offline,
+ .css_free = hugetlb_cgroup_css_free,
+ .subsys_id = hugetlb_subsys_id,
+};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 00000000..3a61efc5
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,141 @@
+/* Inject a hwpoison memory failure on a arbitrary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include "internal.h"
+
+static struct dentry *hwpoison_dir;
+
+static int hwpoison_inject(void *data, u64 val)
+{
+ unsigned long pfn = val;
+ struct page *p;
+ struct page *hpage;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!hwpoison_filter_enable)
+ goto inject;
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+
+ p = pfn_to_page(pfn);
+ hpage = compound_head(p);
+ /*
+ * This implies unable to support free buddy pages.
+ */
+ if (!get_page_unless_zero(hpage))
+ return 0;
+
+ if (!PageLRU(p) && !PageHuge(p))
+ shake_page(p, 0);
+ /*
+ * This implies unable to support non-LRU pages.
+ */
+ if (!PageLRU(p) && !PageHuge(p))
+ return 0;
+
+ /*
+ * do a racy check with elevated page count, to make sure PG_hwpoison
+ * will only be set for the targeted owner (or on a free page).
+ * We temporarily take page lock for try_get_mem_cgroup_from_page().
+ * memory_failure() will redo the check reliably inside page lock.
+ */
+ lock_page(hpage);
+ err = hwpoison_filter(hpage);
+ unlock_page(hpage);
+ if (err)
+ return 0;
+
+inject:
+ printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+ return memory_failure(pfn, 18, MF_COUNT_INCREASED);
+}
+
+static int hwpoison_unpoison(void *data, u64 val)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return unpoison_memory(val);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
+
+static void pfn_inject_exit(void)
+{
+ if (hwpoison_dir)
+ debugfs_remove_recursive(hwpoison_dir);
+}
+
+static int pfn_inject_init(void)
+{
+ struct dentry *dentry;
+
+ hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+ if (hwpoison_dir == NULL)
+ return -ENOMEM;
+
+ /*
+ * Note that the below poison/unpoison interfaces do not involve
+ * hardware status change, hence do not require hardware support.
+ * They are mainly for testing hwpoison in software level.
+ */
+ dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+ NULL, &hwpoison_fops);
+ if (!dentry)
+ goto fail;
+
+ dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
+ NULL, &unpoison_fops);
+ if (!dentry)
+ goto fail;
+
+ dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
+ hwpoison_dir, &hwpoison_filter_enable);
+ if (!dentry)
+ goto fail;
+
+ dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
+ hwpoison_dir, &hwpoison_filter_dev_major);
+ if (!dentry)
+ goto fail;
+
+ dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
+ hwpoison_dir, &hwpoison_filter_dev_minor);
+ if (!dentry)
+ goto fail;
+
+ dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
+ hwpoison_dir, &hwpoison_filter_flags_mask);
+ if (!dentry)
+ goto fail;
+
+ dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
+ hwpoison_dir, &hwpoison_filter_flags_value);
+ if (!dentry)
+ goto fail;
+
+#ifdef CONFIG_MEMCG_SWAP
+ dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
+ hwpoison_dir, &hwpoison_filter_memcg);
+ if (!dentry)
+ goto fail;
+#endif
+
+ return 0;
+fail:
+ pfn_inject_exit();
+ return -ENOMEM;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 00000000..a56a8519
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,25 @@
+#include <linux/mm_types.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpumask.h>
+
+#include <linux/atomic.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
+
+struct mm_struct init_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ INIT_MM_CONTEXT(init_mm)
+};
diff --git a/mm/internal.h b/mm/internal.h
new file mode 100644
index 00000000..8562de0a
--- /dev/null
+++ b/mm/internal.h
@@ -0,0 +1,377 @@
+/* internal.h: mm/ internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
+
+#include <linux/mm.h>
+
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+
+static inline void set_page_count(struct page *page, int v)
+{
+ atomic_set(&page->_count, v);
+}
+
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
+{
+ VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(atomic_read(&page->_count));
+ set_page_count(page, 1);
+}
+
+static inline void __put_page(struct page *page)
+{
+ atomic_dec(&page->_count);
+}
+
+static inline void __get_page_tail_foll(struct page *page,
+ bool get_page_head)
+{
+ /*
+ * If we're getting a tail page, the elevated page->_count is
+ * required only in the head page and we will elevate the head
+ * page->_count and tail page->_mapcount.
+ *
+ * We elevate page_tail->_mapcount for tail pages to force
+ * page_tail->_count to be zero at all times to avoid getting
+ * false positives from get_page_unless_zero() with
+ * speculative page access (like in
+ * page_cache_get_speculative()) on tail pages.
+ */
+ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ if (get_page_head)
+ atomic_inc(&page->first_page->_count);
+ atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ /*
+ * This is safe only because
+ * __split_huge_page_refcount() can't run under
+ * get_page_foll() because we hold the proper PT lock.
+ */
+ __get_page_tail_foll(page, true);
+ else {
+ /*
+ * Getting a normal page or the head of a compound page
+ * requires to already have an elevated page->_count.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_inc(&page->_count);
+ }
+}
+
+extern unsigned long highest_memmap_pfn;
+
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+/*
+ * in mm/rmap.c:
+ */
+extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+
+/*
+ * in mm/page_alloc.c
+ */
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
+extern void prep_compound_page(struct page *page, unsigned long order);
+#ifdef CONFIG_MEMORY_FAILURE
+extern bool is_free_buddy_page(struct page *page);
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
+/*
+ * in mm/compaction.c
+ */
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+ struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head migratepages; /* List of pages being migrated */
+ unsigned long nr_freepages; /* Number of isolated free pages */
+ unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
+ bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool finished_update_free; /* True when the zone cached pfns are
+ * no longer being updated
+ */
+ bool finished_update_migrate;
+
+ int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ struct zone *zone;
+ bool contended; /* True if a lock was contended */
+};
+
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn);
+unsigned long
+isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
+
+#endif
+
+/*
+ * function for dealing with page's order in buddy system.
+ * zone->lock is already acquired when we use these.
+ * So, we don't need atomic page->flags operations here.
+ */
+static inline unsigned long page_order(struct page *page)
+{
+ /* PageBuddy() must be checked by the caller */
+ return page_private(page);
+}
+
+/* mm/util.c */
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent);
+
+#ifdef CONFIG_MMU
+extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+ munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+/*
+ * Called only in fault path, to determine if a new page is being
+ * mapped into a LOCKED vma. If it is, mark page as mlocked.
+ */
+static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
+ struct page *page)
+{
+ VM_BUG_ON(PageLRU(page));
+
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+ return 0;
+
+ if (!TestSetPageMlocked(page)) {
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ }
+ return 1;
+}
+
+/*
+ * must be called with vma's mmap_sem held for read or write, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+extern unsigned int munlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked(). This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void clear_page_mlock(struct page *page);
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag; update statistics.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+ if (TestClearPageMlocked(page)) {
+ unsigned long flags;
+ int nr_pages = hpage_nr_pages(page);
+
+ local_irq_save(flags);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ SetPageMlocked(newpage);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
+ local_irq_restore(flags);
+ }
+}
+
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+ struct vm_area_struct *vma);
+#endif
+#else /* !CONFIG_MMU */
+static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
+{
+ return 0;
+}
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+
+#endif /* !CONFIG_MMU */
+
+/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'. Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+ if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+ return pfn_to_page(page_to_pfn(base) + offset);
+ return base + offset;
+}
+
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'. Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+ struct page *base, int offset)
+{
+ if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+ unsigned long pfn = page_to_pfn(base) + offset;
+ if (!pfn_valid(pfn))
+ return NULL;
+ return pfn_to_page(pfn);
+ }
+ return iter + 1;
+}
+
+/*
+ * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
+ * so all functions starting at paging_init should be marked __init
+ * in those cases. SPARSEMEM, however, allows for memory hotplug,
+ * and alloc_bootmem_node is not used.
+ */
+#ifdef CONFIG_SPARSEMEM
+#define __paginginit __meminit
+#else
+#define __paginginit __init
+#endif
+
+/* Memory initialisation debug and verification */
+enum mminit_level {
+ MMINIT_WARNING,
+ MMINIT_VERIFY,
+ MMINIT_TRACE
+};
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+
+extern int mminit_loglevel;
+
+#define mminit_dprintk(level, prefix, fmt, arg...) \
+do { \
+ if (level < mminit_loglevel) { \
+ printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
+ printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ } \
+} while (0)
+
+extern void mminit_verify_pageflags_layout(void);
+extern void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn);
+extern void mminit_verify_zonelist(void);
+
+#else
+
+static inline void mminit_dprintk(enum mminit_level level,
+ const char *prefix, const char *fmt, ...)
+{
+}
+
+static inline void mminit_verify_pageflags_layout(void)
+{
+}
+
+static inline void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn)
+{
+}
+
+static inline void mminit_verify_zonelist(void)
+{
+}
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
+#if defined(CONFIG_SPARSEMEM)
+extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn);
+#else
+static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+}
+#endif /* CONFIG_SPARSEMEM */
+
+#define ZONE_RECLAIM_NOSCAN -2
+#define ZONE_RECLAIM_FULL -1
+#define ZONE_RECLAIM_SOME 0
+#define ZONE_RECLAIM_SUCCESS 1
+
+extern int hwpoison_filter(struct page *p);
+
+extern u32 hwpoison_filter_dev_major;
+extern u32 hwpoison_filter_dev_minor;
+extern u64 hwpoison_filter_flags_mask;
+extern u64 hwpoison_filter_flags_value;
+extern u64 hwpoison_filter_memcg;
+extern u32 hwpoison_filter_enable;
+
+extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list);
+/* The ALLOC_WMARK bits are used as an index to zone->watermark */
+#define ALLOC_WMARK_MIN WMARK_MIN
+#define ALLOC_WMARK_LOW WMARK_LOW
+#define ALLOC_WMARK_HIGH WMARK_HIGH
+#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
+
+/* Mask to get the watermark bits */
+#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+
+#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 00000000..4a5822a5
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ *
+ * This file is released under the GPL v2.
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rmap.h>
+#include <linux/interval_tree_generic.h>
+
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff;
+}
+
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
+ unsigned long, shared.linear.rb_subtree_last,
+ vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/* Insert node immediately after prev in the interval tree */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+ struct vm_area_struct *prev,
+ struct rb_root *root)
+{
+ struct rb_node **link;
+ struct vm_area_struct *parent;
+ unsigned long last = vma_last_pgoff(node);
+
+ VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+
+ if (!prev->shared.linear.rb.rb_right) {
+ parent = prev;
+ link = &prev->shared.linear.rb.rb_right;
+ } else {
+ parent = rb_entry(prev->shared.linear.rb.rb_right,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ while (parent->shared.linear.rb.rb_left) {
+ parent = rb_entry(parent->shared.linear.rb.rb_left,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ }
+ link = &parent->shared.linear.rb.rb_left;
+ }
+
+ node->shared.linear.rb_subtree_last = last;
+ rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
+ rb_insert_augmented(&node->shared.linear.rb, root,
+ &vma_interval_tree_augment);
+}
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+ avc_start_pgoff, avc_last_pgoff,
+ static inline, __anon_vma_interval_tree)
+
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+ node->cached_vma_start = avc_start_pgoff(node);
+ node->cached_vma_last = avc_last_pgoff(node);
+#endif
+ __anon_vma_interval_tree_insert(node, root);
+}
+
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+ __anon_vma_interval_tree_remove(node, root);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root *root,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+ WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+ WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 00000000..fd814fd6
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
+#include <linux/gfp.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kmemcheck.h>
+
+void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
+{
+ struct page *shadow;
+ int pages;
+ int i;
+
+ pages = 1 << order;
+
+ /*
+ * With kmemcheck enabled, we need to allocate a memory area for the
+ * shadow bits as well.
+ */
+ shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+ if (!shadow) {
+ if (printk_ratelimit())
+ printk(KERN_ERR "kmemcheck: failed to allocate "
+ "shadow bitmap\n");
+ return;
+ }
+
+ for(i = 0; i < pages; ++i)
+ page[i].shadow = page_address(&shadow[i]);
+
+ /*
+ * Mark it as non-present for the MMU so that our accesses to
+ * this memory will trigger a page fault and let us analyze
+ * the memory accesses.
+ */
+ kmemcheck_hide_pages(page, pages);
+}
+
+void kmemcheck_free_shadow(struct page *page, int order)
+{
+ struct page *shadow;
+ int pages;
+ int i;
+
+ if (!kmemcheck_page_is_tracked(page))
+ return;
+
+ pages = 1 << order;
+
+ kmemcheck_show_pages(page, pages);
+
+ shadow = virt_to_page(page[0].shadow);
+
+ for(i = 0; i < pages; ++i)
+ page[i].shadow = NULL;
+
+ __free_pages(shadow, order);
+}
+
+void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
+ size_t size)
+{
+ /*
+ * Has already been memset(), which initializes the shadow for us
+ * as well.
+ */
+ if (gfpflags & __GFP_ZERO)
+ return;
+
+ /* No need to initialize the shadow of a non-tracked slab. */
+ if (s->flags & SLAB_NOTRACK)
+ return;
+
+ if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
+ /*
+ * Allow notracked objects to be allocated from
+ * tracked caches. Note however that these objects
+ * will still get page faults on access, they just
+ * won't ever be flagged as uninitialized. If page
+ * faults are not acceptable, the slab cache itself
+ * should be marked NOTRACK.
+ */
+ kmemcheck_mark_initialized(object, size);
+ } else if (!s->ctor) {
+ /*
+ * New objects should be marked uninitialized before
+ * they're returned to the called.
+ */
+ kmemcheck_mark_uninitialized(object, size);
+ }
+}
+
+void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
+{
+ /* TODO: RCU freeing is unsupported for now; hide false positives. */
+ if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
+ kmemcheck_mark_freed(object, size);
+}
+
+void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
+ gfp_t gfpflags)
+{
+ int pages;
+
+ if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
+ return;
+
+ pages = 1 << order;
+
+ /*
+ * NOTE: We choose to track GFP_ZERO pages too; in fact, they
+ * can become uninitialized by copying uninitialized memory
+ * into them.
+ */
+
+ /* XXX: Can use zone->node for node? */
+ kmemcheck_alloc_shadow(page, order, gfpflags, -1);
+
+ if (gfpflags & __GFP_ZERO)
+ kmemcheck_mark_initialized_pages(page, pages);
+ else
+ kmemcheck_mark_uninitialized_pages(page, pages);
+}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 00000000..ff0d9779
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,109 @@
+/*
+ * mm/kmemleak-test.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/fdtable.h>
+
+#include <linux/kmemleak.h>
+
+struct test_node {
+ long header[25];
+ struct list_head list;
+ long footer[25];
+};
+
+static LIST_HEAD(test_list);
+static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
+
+/*
+ * Some very simple testing. This function needs to be extended for
+ * proper testing.
+ */
+static int __init kmemleak_test_init(void)
+{
+ struct test_node *elem;
+ int i;
+
+ printk(KERN_INFO "Kmemleak testing\n");
+
+ /* make some orphan objects */
+ pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+#ifndef CONFIG_MODULES
+ pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+ kmem_cache_alloc(files_cachep, GFP_KERNEL));
+ pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+ kmem_cache_alloc(files_cachep, GFP_KERNEL));
+#endif
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+
+ /*
+ * Add elements to a list. They should only appear as orphan
+ * after the module is removed.
+ */
+ for (i = 0; i < 10; i++) {
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+ pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
+ if (!elem)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&elem->list);
+ list_add_tail(&elem->list, &test_list);
+ }
+
+ for_each_possible_cpu(i) {
+ per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
+ pr_info("kmemleak: kmalloc(129) = %p\n",
+ per_cpu(kmemleak_test_pointer, i));
+ }
+
+ return 0;
+}
+module_init(kmemleak_test_init);
+
+static void __exit kmemleak_test_exit(void)
+{
+ struct test_node *elem, *tmp;
+
+ /*
+ * Remove the list elements without actually freeing the
+ * memory.
+ */
+ list_for_each_entry_safe(elem, tmp, &test_list, list)
+ list_del(&elem->list);
+}
+module_exit(kmemleak_test_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 00000000..c8d7f311
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1866 @@
+/*
+ * mm/kmemleak.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * For more information on the algorithm and kmemleak usage, please see
+ * Documentation/kmemleak.txt.
+ *
+ * Notes on locking
+ * ----------------
+ *
+ * The following locks and mutexes are used by kmemleak:
+ *
+ * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * accesses to the object_tree_root. The object_list is the main list
+ * holding the metadata (struct kmemleak_object) for the allocated memory
+ * blocks. The object_tree_root is a red black tree used to look-up
+ * metadata based on a pointer to the corresponding memory block. The
+ * kmemleak_object structures are added to the object_list and
+ * object_tree_root in the create_object() function called from the
+ * kmemleak_alloc() callback and removed in delete_object() called from the
+ * kmemleak_free() callback
+ * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
+ * the metadata (e.g. count) are protected by this lock. Note that some
+ * members of this structure may be protected by other means (atomic or
+ * kmemleak_lock). This lock is also held when scanning the corresponding
+ * memory block to avoid the kernel freeing it via the kmemleak_free()
+ * callback. This is less heavyweight than holding a global lock like
+ * kmemleak_lock during scanning
+ * - scan_mutex (mutex): ensures that only one thread may scan the memory for
+ * unreferenced objects at a time. The gray_list contains the objects which
+ * are already referenced or marked as false positives and need to be
+ * scanned. This list is only modified during a scanning episode when the
+ * scan_mutex is held. At the end of a scan, the gray_list is always empty.
+ * Note that the kmemleak_object.use_count is incremented when an object is
+ * added to the gray_list and therefore cannot be freed. This mutex also
+ * prevents multiple users of the "kmemleak" debugfs file together with
+ * modifications to the memory scanning parameters including the scan_thread
+ * pointer
+ *
+ * The kmemleak_object structures have a use_count incremented or decremented
+ * using the get_object()/put_object() functions. When the use_count becomes
+ * 0, this count can no longer be incremented and put_object() schedules the
+ * kmemleak_object freeing via an RCU callback. All calls to the get_object()
+ * function must be protected by rcu_read_lock() to avoid accessing a freed
+ * structure.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/kthread.h>
+#include <linux/rbtree.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/stacktrace.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/err.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/nodemask.h>
+#include <linux/mm.h>
+#include <linux/workqueue.h>
+#include <linux/crc32.h>
+
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <linux/atomic.h>
+
+#include <linux/kmemcheck.h>
+#include <linux/kmemleak.h>
+#include <linux/memory_hotplug.h>
+
+/*
+ * Kmemleak configuration and common defines.
+ */
+#define MAX_TRACE 16 /* stack trace length */
+#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
+#define SECS_FIRST_SCAN 60 /* delay before the first scan */
+#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
+#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
+
+#define BYTES_PER_POINTER sizeof(void *)
+
+/* GFP bitmask for kmemleak internal allocations */
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+ __GFP_NORETRY | __GFP_NOMEMALLOC | \
+ __GFP_NOWARN)
+
+/* scanning area inside a memory block */
+struct kmemleak_scan_area {
+ struct hlist_node node;
+ unsigned long start;
+ size_t size;
+};
+
+#define KMEMLEAK_GREY 0
+#define KMEMLEAK_BLACK -1
+
+/*
+ * Structure holding the metadata for each allocated memory block.
+ * Modifications to such objects should be made while holding the
+ * object->lock. Insertions or deletions from object_list, gray_list or
+ * rb_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+ spinlock_t lock;
+ unsigned long flags; /* object status flags */
+ struct list_head object_list;
+ struct list_head gray_list;
+ struct rb_node rb_node;
+ struct rcu_head rcu; /* object_list lockless traversal */
+ /* object usage count; object freed when use_count == 0 */
+ atomic_t use_count;
+ unsigned long pointer;
+ size_t size;
+ /* minimum number of a pointers found before it is considered leak */
+ int min_count;
+ /* the total number of pointers found pointing to this object */
+ int count;
+ /* checksum for detecting modified objects */
+ u32 checksum;
+ /* memory ranges to be scanned inside an object (empty for all) */
+ struct hlist_head area_list;
+ unsigned long trace[MAX_TRACE];
+ unsigned int trace_len;
+ unsigned long jiffies; /* creation timestamp */
+ pid_t pid; /* pid of the current task */
+ char comm[TASK_COMM_LEN]; /* executable name */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED (1 << 0)
+/* flag set after the first reporting of an unreference object */
+#define OBJECT_REPORTED (1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN (1 << 2)
+
+/* number of bytes to print per line; must be 16 or 32 */
+#define HEX_ROW_SIZE 16
+/* number of bytes to print at a time (1, 2, 4, 8) */
+#define HEX_GROUP_SIZE 1
+/* include ASCII after the hex output */
+#define HEX_ASCII 1
+/* max number of lines to be printed */
+#define HEX_MAX_LINES 2
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
+static DEFINE_RWLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;
+static struct kmem_cache *scan_area_cache;
+
+/* set if tracing memory operations is enabled */
+static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
+/* set in the late_initcall if there were no errors */
+static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
+/* enables or disables early logging of the memory operations */
+static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
+/* set if a kmemleak warning was issued */
+static atomic_t kmemleak_warning = ATOMIC_INIT(0);
+/* set if a fatal kmemleak error has occurred */
+static atomic_t kmemleak_error = ATOMIC_INIT(0);
+
+/* minimum and maximum address that may be valid pointers */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+static struct task_struct *scan_thread;
+/* used to avoid reporting of recently allocated objects */
+static unsigned long jiffies_min_age;
+static unsigned long jiffies_last_scan;
+/* delay between automatic memory scannings */
+static signed long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan = 1;
+/* protects the memory scanning, parameters and debug/kmemleak file access */
+static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+
+
+/*
+ * Early object allocation/freeing logging. Kmemleak is initialized after the
+ * kernel allocator. However, both the kernel allocator and kmemleak may
+ * allocate memory blocks which need to be tracked. Kmemleak defines an
+ * arbitrary buffer to hold the allocation/freeing information before it is
+ * fully initialized.
+ */
+
+/* kmemleak operation type for early logging */
+enum {
+ KMEMLEAK_ALLOC,
+ KMEMLEAK_ALLOC_PERCPU,
+ KMEMLEAK_FREE,
+ KMEMLEAK_FREE_PART,
+ KMEMLEAK_FREE_PERCPU,
+ KMEMLEAK_NOT_LEAK,
+ KMEMLEAK_IGNORE,
+ KMEMLEAK_SCAN_AREA,
+ KMEMLEAK_NO_SCAN
+};
+
+/*
+ * Structure holding the information passed to kmemleak callbacks during the
+ * early logging.
+ */
+struct early_log {
+ int op_type; /* kmemleak operation type */
+ const void *ptr; /* allocated/freed memory block */
+ size_t size; /* memory block size */
+ int min_count; /* minimum reference count */
+ unsigned long trace[MAX_TRACE]; /* stack trace */
+ unsigned int trace_len; /* stack trace length */
+};
+
+/* early logging buffer and current position */
+static struct early_log
+ early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
+static int crt_early_log __initdata;
+
+static void kmemleak_disable(void);
+
+/*
+ * Print a warning and dump the stack trace.
+ */
+#define kmemleak_warn(x...) do { \
+ pr_warning(x); \
+ dump_stack(); \
+ atomic_set(&kmemleak_warning, 1); \
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
+ * recovered from. Kmemleak will be disabled and further allocation/freeing
+ * tracing no longer available.
+ */
+#define kmemleak_stop(x...) do { \
+ kmemleak_warn(x); \
+ kmemleak_disable(); \
+} while (0)
+
+/*
+ * Printing of the objects hex dump to the seq file. The number of lines to be
+ * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
+ * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
+ * with the object->lock held.
+ */
+static void hex_dump_object(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ const u8 *ptr = (const u8 *)object->pointer;
+ int i, len, remaining;
+ unsigned char linebuf[HEX_ROW_SIZE * 5];
+
+ /* limit the number of lines to HEX_MAX_LINES */
+ remaining = len =
+ min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
+
+ seq_printf(seq, " hex dump (first %d bytes):\n", len);
+ for (i = 0; i < len; i += HEX_ROW_SIZE) {
+ int linelen = min(remaining, HEX_ROW_SIZE);
+
+ remaining -= HEX_ROW_SIZE;
+ hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
+ HEX_ASCII);
+ seq_printf(seq, " %s\n", linebuf);
+ }
+}
+
+/*
+ * Object colors, encoded with count and min_count:
+ * - white - orphan object, not enough references to it (count < min_count)
+ * - gray - not orphan, not marked as false positive (min_count == 0) or
+ * sufficient references to it (count >= min_count)
+ * - black - ignore, it doesn't contain references (e.g. text section)
+ * (min_count == -1). No function defined for this color.
+ * Newly created objects don't have any color assigned (object->count == -1)
+ * before the next memory scan when they become white.
+ */
+static bool color_white(const struct kmemleak_object *object)
+{
+ return object->count != KMEMLEAK_BLACK &&
+ object->count < object->min_count;
+}
+
+static bool color_gray(const struct kmemleak_object *object)
+{
+ return object->min_count != KMEMLEAK_BLACK &&
+ object->count >= object->min_count;
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they have
+ * not be deleted and have a minimum age to avoid false positives caused by
+ * pointers temporarily stored in CPU registers.
+ */
+static bool unreferenced_object(struct kmemleak_object *object)
+{
+ return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
+ time_before_eq(object->jiffies + jiffies_min_age,
+ jiffies_last_scan);
+}
+
+/*
+ * Printing of the unreferenced objects information to the seq file. The
+ * print_unreferenced function must be called with the object->lock held.
+ */
+static void print_unreferenced(struct seq_file *seq,
+ struct kmemleak_object *object)
+{
+ int i;
+ unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
+
+ seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+ object->pointer, object->size);
+ seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ object->comm, object->pid, object->jiffies,
+ msecs_age / 1000, msecs_age % 1000);
+ hex_dump_object(seq, object);
+ seq_printf(seq, " backtrace:\n");
+
+ for (i = 0; i < object->trace_len; i++) {
+ void *ptr = (void *)object->trace[i];
+ seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ }
+}
+
+/*
+ * Print the kmemleak_object information. This function is used mainly for
+ * debugging special cases when kmemleak operations. It must be called with
+ * the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+ struct stack_trace trace;
+
+ trace.nr_entries = object->trace_len;
+ trace.entries = object->trace;
+
+ pr_notice("Object 0x%08lx (size %zu):\n",
+ object->pointer, object->size);
+ pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
+ object->comm, object->pid, object->jiffies);
+ pr_notice(" min_count = %d\n", object->min_count);
+ pr_notice(" count = %d\n", object->count);
+ pr_notice(" flags = 0x%lx\n", object->flags);
+ pr_notice(" checksum = %d\n", object->checksum);
+ pr_notice(" backtrace:\n");
+ print_stack_trace(&trace, 4);
+}
+
+/*
+ * Look-up a memory block metadata (kmemleak_object) in the object search
+ * tree based on a pointer value. If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+ struct rb_node *rb = object_tree_root.rb_node;
+
+ while (rb) {
+ struct kmemleak_object *object =
+ rb_entry(rb, struct kmemleak_object, rb_node);
+ if (ptr < object->pointer)
+ rb = object->rb_node.rb_left;
+ else if (object->pointer + object->size <= ptr)
+ rb = object->rb_node.rb_right;
+ else if (object->pointer == ptr || alias)
+ return object;
+ else {
+ kmemleak_warn("Found object by alias at 0x%08lx\n",
+ ptr);
+ dump_object_info(object);
+ break;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reached 0, the RCU freeing was already
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+ return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * RCU callback to free a kmemleak_object.
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+ struct hlist_node *tmp;
+ struct kmemleak_scan_area *area;
+ struct kmemleak_object *object =
+ container_of(rcu, struct kmemleak_object, rcu);
+
+ /*
+ * Once use_count is 0 (guaranteed by put_object), there is no other
+ * code accessing this object, hence no need for locking.
+ */
+ hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
+ hlist_del(&area->node);
+ kmem_cache_free(scan_area_cache, area);
+ }
+ kmem_cache_free(object_cache, object);
+}
+
+/*
+ * Decrement the object use_count. Once the count is 0, free the object using
+ * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
+ * delete_object() path, the delayed RCU freeing ensures that there is no
+ * recursive call to the kernel allocator. Lock-less RCU object_list traversal
+ * is also possible.
+ */
+static void put_object(struct kmemleak_object *object)
+{
+ if (!atomic_dec_and_test(&object->use_count))
+ return;
+
+ /* should only get here after delete_object was called */
+ WARN_ON(object->flags & OBJECT_ALLOCATED);
+
+ call_rcu(&object->rcu, free_object_rcu);
+}
+
+/*
+ * Look up an object in the object search tree and increase its use_count.
+ */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+ unsigned long flags;
+ struct kmemleak_object *object = NULL;
+
+ rcu_read_lock();
+ read_lock_irqsave(&kmemleak_lock, flags);
+ if (ptr >= min_addr && ptr < max_addr)
+ object = lookup_object(ptr, alias);
+ read_unlock_irqrestore(&kmemleak_lock, flags);
+
+ /* check whether the object is still available */
+ if (object && !get_object(object))
+ object = NULL;
+ rcu_read_unlock();
+
+ return object;
+}
+
+/*
+ * Save stack trace to the given array of MAX_TRACE size.
+ */
+static int __save_stack_trace(unsigned long *trace)
+{
+ struct stack_trace stack_trace;
+
+ stack_trace.max_entries = MAX_TRACE;
+ stack_trace.nr_entries = 0;
+ stack_trace.entries = trace;
+ stack_trace.skip = 2;
+ save_stack_trace(&stack_trace);
+
+ return stack_trace.nr_entries;
+}
+
+/*
+ * Create the metadata (struct kmemleak_object) corresponding to an allocated
+ * memory block and add it to the object_list and object_tree_root.
+ */
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object, *parent;
+ struct rb_node **link, *rb_parent;
+
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ if (!object) {
+ pr_warning("Cannot allocate a kmemleak_object structure\n");
+ kmemleak_disable();
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&object->object_list);
+ INIT_LIST_HEAD(&object->gray_list);
+ INIT_HLIST_HEAD(&object->area_list);
+ spin_lock_init(&object->lock);
+ atomic_set(&object->use_count, 1);
+ object->flags = OBJECT_ALLOCATED;
+ object->pointer = ptr;
+ object->size = size;
+ object->min_count = min_count;
+ object->count = 0; /* white color initially */
+ object->jiffies = jiffies;
+ object->checksum = 0;
+
+ /* task information */
+ if (in_irq()) {
+ object->pid = 0;
+ strncpy(object->comm, "hardirq", sizeof(object->comm));
+ } else if (in_softirq()) {
+ object->pid = 0;
+ strncpy(object->comm, "softirq", sizeof(object->comm));
+ } else {
+ object->pid = current->pid;
+ /*
+ * There is a small chance of a race with set_task_comm(),
+ * however using get_task_comm() here may cause locking
+ * dependency issues with current->alloc_lock. In the worst
+ * case, the command line is not correct.
+ */
+ strncpy(object->comm, current->comm, sizeof(object->comm));
+ }
+
+ /* kernel backtrace */
+ object->trace_len = __save_stack_trace(object->trace);
+
+ write_lock_irqsave(&kmemleak_lock, flags);
+
+ min_addr = min(min_addr, ptr);
+ max_addr = max(max_addr, ptr + size);
+ link = &object_tree_root.rb_node;
+ rb_parent = NULL;
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+ if (ptr + size <= parent->pointer)
+ link = &parent->rb_node.rb_left;
+ else if (parent->pointer + parent->size <= ptr)
+ link = &parent->rb_node.rb_right;
+ else {
+ kmemleak_stop("Cannot insert 0x%lx into the object "
+ "search tree (overlaps existing)\n",
+ ptr);
+ kmem_cache_free(object_cache, object);
+ object = parent;
+ spin_lock(&object->lock);
+ dump_object_info(object);
+ spin_unlock(&object->lock);
+ goto out;
+ }
+ }
+ rb_link_node(&object->rb_node, rb_parent, link);
+ rb_insert_color(&object->rb_node, &object_tree_root);
+
+ list_add_tail_rcu(&object->object_list, &object_list);
+out:
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+ return object;
+}
+
+/*
+ * Remove the metadata (struct kmemleak_object) for a memory block from the
+ * object_list and object_tree_root and decrement its use_count.
+ */
+static void __delete_object(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&kmemleak_lock, flags);
+ rb_erase(&object->rb_node, &object_tree_root);
+ list_del_rcu(&object->object_list);
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+
+ WARN_ON(!(object->flags & OBJECT_ALLOCATED));
+ WARN_ON(atomic_read(&object->use_count) < 2);
+
+ /*
+ * Locking here also ensures that the corresponding memory block
+ * cannot be freed when it is being scanned.
+ */
+ spin_lock_irqsave(&object->lock, flags);
+ object->flags &= ~OBJECT_ALLOCATED;
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it.
+ */
+static void delete_object_full(unsigned long ptr)
+{
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Freeing unknown object at 0x%08lx\n",
+ ptr);
+#endif
+ return;
+ }
+ __delete_object(object);
+ put_object(object);
+}
+
+/*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it. If the memory block is partially freed, the function may create
+ * additional metadata for the remaining parts of the block.
+ */
+static void delete_object_part(unsigned long ptr, size_t size)
+{
+ struct kmemleak_object *object;
+ unsigned long start, end;
+
+ object = find_and_get_object(ptr, 1);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx "
+ "(size %zu)\n", ptr, size);
+#endif
+ return;
+ }
+ __delete_object(object);
+
+ /*
+ * Create one or two objects that may result from the memory block
+ * split. Note that partial freeing is only done by free_bootmem() and
+ * this happens before kmemleak_init() is called. The path below is
+ * only executed during early log recording in kmemleak_init(), so
+ * GFP_KERNEL is enough.
+ */
+ start = object->pointer;
+ end = object->pointer + object->size;
+ if (ptr > start)
+ create_object(start, ptr - start, object->min_count,
+ GFP_KERNEL);
+ if (ptr + size < end)
+ create_object(ptr + size, end - ptr - size, object->min_count,
+ GFP_KERNEL);
+
+ put_object(object);
+}
+
+static void __paint_it(struct kmemleak_object *object, int color)
+{
+ object->min_count = color;
+ if (color == KMEMLEAK_BLACK)
+ object->flags |= OBJECT_NO_SCAN;
+}
+
+static void paint_it(struct kmemleak_object *object, int color)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&object->lock, flags);
+ __paint_it(object, color);
+ spin_unlock_irqrestore(&object->lock, flags);
+}
+
+static void paint_ptr(unsigned long ptr, int color)
+{
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Trying to color unknown object "
+ "at 0x%08lx as %s\n", ptr,
+ (color == KMEMLEAK_GREY) ? "Grey" :
+ (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
+ return;
+ }
+ paint_it(object, color);
+ put_object(object);
+}
+
+/*
+ * Mark an object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+ paint_ptr(ptr, KMEMLEAK_GREY);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting.
+ */
+static void make_black_object(unsigned long ptr)
+{
+ paint_ptr(ptr, KMEMLEAK_BLACK);
+}
+
+/*
+ * Add a scanning area to the object. If at least one such area is added,
+ * kmemleak will only scan these ranges rather than the whole memory block.
+ */
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ struct kmemleak_scan_area *area;
+
+ object = find_and_get_object(ptr, 1);
+ if (!object) {
+ kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+ if (!area) {
+ pr_warning("Cannot allocate a scan area\n");
+ goto out;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ if (ptr + size > object->pointer + object->size) {
+ kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+ dump_object_info(object);
+ kmem_cache_free(scan_area_cache, area);
+ goto out_unlock;
+ }
+
+ INIT_HLIST_NODE(&area->node);
+ area->start = ptr;
+ area->size = size;
+
+ hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+ spin_unlock_irqrestore(&object->lock, flags);
+out:
+ put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
+ * pointer. Such object will not be scanned by kmemleak but references to it
+ * are searched.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
+ return;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ object->flags |= OBJECT_NO_SCAN;
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
+/*
+ * Log an early kmemleak_* call to the early_log buffer. These calls will be
+ * processed later once kmemleak is fully initialized.
+ */
+static void __init log_early(int op_type, const void *ptr, size_t size,
+ int min_count)
+{
+ unsigned long flags;
+ struct early_log *log;
+
+ if (atomic_read(&kmemleak_error)) {
+ /* kmemleak stopped recording, just count the requests */
+ crt_early_log++;
+ return;
+ }
+
+ if (crt_early_log >= ARRAY_SIZE(early_log)) {
+ kmemleak_disable();
+ return;
+ }
+
+ /*
+ * There is no need for locking since the kernel is still in UP mode
+ * at this stage. Disabling the IRQs is enough.
+ */
+ local_irq_save(flags);
+ log = &early_log[crt_early_log];
+ log->op_type = op_type;
+ log->ptr = ptr;
+ log->size = size;
+ log->min_count = min_count;
+ log->trace_len = __save_stack_trace(log->trace);
+ crt_early_log++;
+ local_irq_restore(flags);
+}
+
+/*
+ * Log an early allocated block and populate the stack trace.
+ */
+static void early_alloc(struct early_log *log)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+ int i;
+
+ if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
+ return;
+
+ /*
+ * RCU locking needed to ensure object is not freed via put_object().
+ */
+ rcu_read_lock();
+ object = create_object((unsigned long)log->ptr, log->size,
+ log->min_count, GFP_ATOMIC);
+ if (!object)
+ goto out;
+ spin_lock_irqsave(&object->lock, flags);
+ for (i = 0; i < log->trace_len; i++)
+ object->trace[i] = log->trace[i];
+ object->trace_len = log->trace_len;
+ spin_unlock_irqrestore(&object->lock, flags);
+out:
+ rcu_read_unlock();
+}
+
+/*
+ * Log an early allocated block and populate the stack trace.
+ */
+static void early_alloc_percpu(struct early_log *log)
+{
+ unsigned int cpu;
+ const void __percpu *ptr = log->ptr;
+
+ for_each_possible_cpu(cpu) {
+ log->ptr = per_cpu_ptr(ptr, cpu);
+ early_alloc(log);
+ }
+}
+
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
+ */
+void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp)
+{
+ pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ create_object((unsigned long)ptr, size, min_count, gfp);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/**
+ * kmemleak_alloc_percpu - register a newly allocated __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ * @size: size of the object
+ *
+ * This function is called from the kernel percpu allocator when a new object
+ * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
+ * allocation.
+ */
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+{
+ unsigned int cpu;
+
+ pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
+
+ /*
+ * Percpu allocations are only scanned and not reported as leaks
+ * (min_count is set to 0).
+ */
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ create_object((unsigned long)per_cpu_ptr(ptr, cpu),
+ size, 0, GFP_KERNEL);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
+
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
+ */
+void __ref kmemleak_free(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ delete_object_full((unsigned long)ptr);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_FREE, ptr, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
+ */
+void __ref kmemleak_free_part(const void *ptr, size_t size)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ delete_object_part((unsigned long)ptr, size);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_part);
+
+/**
+ * kmemleak_free_percpu - unregister a previously registered __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ *
+ * This function is called from the kernel percpu allocator when an object
+ * (memory block) is freed (free_percpu).
+ */
+void __ref kmemleak_free_percpu(const void __percpu *ptr)
+{
+ unsigned int cpu;
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ delete_object_full((unsigned long)per_cpu_ptr(ptr,
+ cpu));
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
+
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
+ */
+void __ref kmemleak_not_leak(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ make_gray_object((unsigned long)ptr);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
+ */
+void __ref kmemleak_ignore(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ make_black_object((unsigned long)ptr);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ * represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number false negatives.
+ */
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr))
+ add_scan_area((unsigned long)ptr, size, gfp);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
+ */
+void __ref kmemleak_no_scan(const void *ptr)
+{
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ object_no_scan((unsigned long)ptr);
+ else if (atomic_read(&kmemleak_early_log))
+ log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/*
+ * Update an object's checksum and return true if it was modified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+ u32 old_csum = object->checksum;
+
+ if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
+ return false;
+
+ object->checksum = crc32(0, (void *)object->pointer, object->size);
+ return object->checksum != old_csum;
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptable. This
+ * function checks whether such interrupt condition occurred.
+ */
+static int scan_should_stop(void)
+{
+ if (!atomic_read(&kmemleak_enabled))
+ return 1;
+
+ /*
+ * This function may be called from either process or kthread context,
+ * hence the need to check for both stop conditions.
+ */
+ if (current->mm)
+ return signal_pending(current);
+ else
+ return kthread_should_stop();
+
+ return 0;
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
+ */
+static void scan_block(void *_start, void *_end,
+ struct kmemleak_object *scanned, int allow_resched)
+{
+ unsigned long *ptr;
+ unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+ unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+
+ for (ptr = start; ptr < end; ptr++) {
+ struct kmemleak_object *object;
+ unsigned long flags;
+ unsigned long pointer;
+
+ if (allow_resched)
+ cond_resched();
+ if (scan_should_stop())
+ break;
+
+ /* don't scan uninitialized memory */
+ if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
+ BYTES_PER_POINTER))
+ continue;
+
+ pointer = *ptr;
+
+ object = find_and_get_object(pointer, 1);
+ if (!object)
+ continue;
+ if (object == scanned) {
+ /* self referenced, ignore */
+ put_object(object);
+ continue;
+ }
+
+ /*
+ * Avoid the lockdep recursive warning on object->lock being
+ * previously acquired in scan_object(). These locks are
+ * enclosed by scan_mutex.
+ */
+ spin_lock_irqsave_nested(&object->lock, flags,
+ SINGLE_DEPTH_NESTING);
+ if (!color_white(object)) {
+ /* non-orphan, ignored or new */
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+ continue;
+ }
+
+ /*
+ * Increase the object's reference count (number of pointers
+ * to the memory block). If this count reaches the required
+ * minimum, the object's color will become gray and it will be
+ * added to the gray_list.
+ */
+ object->count++;
+ if (color_gray(object)) {
+ list_add_tail(&object->gray_list, &gray_list);
+ spin_unlock_irqrestore(&object->lock, flags);
+ continue;
+ }
+
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+ }
+}
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A condition is
+ * that object->use_count >= 1.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+ struct kmemleak_scan_area *area;
+ unsigned long flags;
+
+ /*
+ * Once the object->lock is acquired, the corresponding memory block
+ * cannot be freed (the same lock is acquired in delete_object).
+ */
+ spin_lock_irqsave(&object->lock, flags);
+ if (object->flags & OBJECT_NO_SCAN)
+ goto out;
+ if (!(object->flags & OBJECT_ALLOCATED))
+ /* already freed object */
+ goto out;
+ if (hlist_empty(&object->area_list)) {
+ void *start = (void *)object->pointer;
+ void *end = (void *)(object->pointer + object->size);
+
+ while (start < end && (object->flags & OBJECT_ALLOCATED) &&
+ !(object->flags & OBJECT_NO_SCAN)) {
+ scan_block(start, min(start + MAX_SCAN_SIZE, end),
+ object, 0);
+ start += MAX_SCAN_SIZE;
+
+ spin_unlock_irqrestore(&object->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&object->lock, flags);
+ }
+ } else
+ hlist_for_each_entry(area, &object->area_list, node)
+ scan_block((void *)area->start,
+ (void *)(area->start + area->size),
+ object, 0);
+out:
+ spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ */
+static void scan_gray_list(void)
+{
+ struct kmemleak_object *object, *tmp;
+
+ /*
+ * The list traversal is safe for both tail additions and removals
+ * from inside the loop. The kmemleak objects cannot be freed from
+ * outside the loop because their use_count was incremented.
+ */
+ object = list_entry(gray_list.next, typeof(*object), gray_list);
+ while (&object->gray_list != &gray_list) {
+ cond_resched();
+
+ /* may add new objects to the list */
+ if (!scan_should_stop())
+ scan_object(object);
+
+ tmp = list_entry(object->gray_list.next, typeof(*object),
+ gray_list);
+
+ /* remove the object from the list and release it */
+ list_del(&object->gray_list);
+ put_object(object);
+
+ object = tmp;
+ }
+ WARN_ON(!list_empty(&gray_list));
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ */
+static void kmemleak_scan(void)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ int i;
+ int new_leaks = 0;
+
+ jiffies_last_scan = jiffies;
+
+ /* prepare the kmemleak_object's */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+#ifdef DEBUG
+ /*
+ * With a few exceptions there should be a maximum of
+ * 1 reference to any object at this point.
+ */
+ if (atomic_read(&object->use_count) > 1) {
+ pr_debug("object->use_count = %d\n",
+ atomic_read(&object->use_count));
+ dump_object_info(object);
+ }
+#endif
+ /* reset the reference count (whiten the object) */
+ object->count = 0;
+ if (color_gray(object) && get_object(object))
+ list_add_tail(&object->gray_list, &gray_list);
+
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ /* data/bss scanning */
+ scan_block(_sdata, _edata, NULL, 1);
+ scan_block(__bss_start, __bss_stop, NULL, 1);
+
+#ifdef CONFIG_SMP
+ /* per-cpu sections scanning */
+ for_each_possible_cpu(i)
+ scan_block(__per_cpu_start + per_cpu_offset(i),
+ __per_cpu_end + per_cpu_offset(i), NULL, 1);
+#endif
+
+ /*
+ * Struct page scanning for each node.
+ */
+ lock_memory_hotplug();
+ for_each_online_node(i) {
+ unsigned long start_pfn = node_start_pfn(i);
+ unsigned long end_pfn = node_end_pfn(i);
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ /* only scan if page is in use */
+ if (page_count(page) == 0)
+ continue;
+ scan_block(page, page + 1, NULL, 1);
+ }
+ }
+ unlock_memory_hotplug();
+
+ /*
+ * Scanning the task stacks (may introduce false negatives).
+ */
+ if (kmemleak_stack_scan) {
+ struct task_struct *p, *g;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ scan_block(task_stack_page(p), task_stack_page(p) +
+ THREAD_SIZE, NULL, 0);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ }
+
+ /*
+ * Scan the objects already referenced from the sections scanned
+ * above.
+ */
+ scan_gray_list();
+
+ /*
+ * Check for new or unreferenced objects modified since the previous
+ * scan and color them gray until the next scan.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+ if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
+ && update_checksum(object) && get_object(object)) {
+ /* color it gray temporarily */
+ object->count = object->min_count;
+ list_add_tail(&object->gray_list, &gray_list);
+ }
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ /*
+ * Re-scan the gray list for modified unreferenced objects.
+ */
+ scan_gray_list();
+
+ /*
+ * If scanning was stopped do not report any new unreferenced objects.
+ */
+ if (scan_should_stop())
+ return;
+
+ /*
+ * Scanning result reporting.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+ if (unreferenced_object(object) &&
+ !(object->flags & OBJECT_REPORTED)) {
+ object->flags |= OBJECT_REPORTED;
+ new_leaks++;
+ }
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ if (new_leaks)
+ pr_info("%d new suspected memory leaks (see "
+ "/sys/kernel/debug/kmemleak)\n", new_leaks);
+
+}
+
+/*
+ * Thread function performing automatic memory scanning. Unreferenced objects
+ * at the end of a memory scan are reported but only the first time.
+ */
+static int kmemleak_scan_thread(void *arg)
+{
+ static int first_run = 1;
+
+ pr_info("Automatic memory scanning thread started\n");
+ set_user_nice(current, 10);
+
+ /*
+ * Wait before the first scan to allow the system to fully initialize.
+ */
+ if (first_run) {
+ first_run = 0;
+ ssleep(SECS_FIRST_SCAN);
+ }
+
+ while (!kthread_should_stop()) {
+ signed long timeout = jiffies_scan_wait;
+
+ mutex_lock(&scan_mutex);
+ kmemleak_scan();
+ mutex_unlock(&scan_mutex);
+
+ /* wait before the next scan */
+ while (timeout && !kthread_should_stop())
+ timeout = schedule_timeout_interruptible(timeout);
+ }
+
+ pr_info("Automatic memory scanning thread ended\n");
+
+ return 0;
+}
+
+/*
+ * Start the automatic memory scanning thread. This function must be called
+ * with the scan_mutex held.
+ */
+static void start_scan_thread(void)
+{
+ if (scan_thread)
+ return;
+ scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
+ if (IS_ERR(scan_thread)) {
+ pr_warning("Failed to create the scan thread\n");
+ scan_thread = NULL;
+ }
+}
+
+/*
+ * Stop the automatic memory scanning thread. This function must be called
+ * with the scan_mutex held.
+ */
+static void stop_scan_thread(void)
+{
+ if (scan_thread) {
+ kthread_stop(scan_thread);
+ scan_thread = NULL;
+ }
+}
+
+/*
+ * Iterate over the object_list and return the first valid object at or after
+ * the required position with its use_count incremented. The function triggers
+ * a memory scanning when the pos argument points to the first position.
+ */
+static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct kmemleak_object *object;
+ loff_t n = *pos;
+ int err;
+
+ err = mutex_lock_interruptible(&scan_mutex);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ if (n-- > 0)
+ continue;
+ if (get_object(object))
+ goto out;
+ }
+ object = NULL;
+out:
+ return object;
+}
+
+/*
+ * Return the next object in the object_list. The function decrements the
+ * use_count of the previous object and increases that of the next one.
+ */
+static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct kmemleak_object *prev_obj = v;
+ struct kmemleak_object *next_obj = NULL;
+ struct kmemleak_object *obj = prev_obj;
+
+ ++(*pos);
+
+ list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
+ if (get_object(obj)) {
+ next_obj = obj;
+ break;
+ }
+ }
+
+ put_object(prev_obj);
+ return next_obj;
+}
+
+/*
+ * Decrement the use_count of the last object required, if any.
+ */
+static void kmemleak_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!IS_ERR(v)) {
+ /*
+ * kmemleak_seq_start may return ERR_PTR if the scan_mutex
+ * waiting was interrupted, so only release it if !IS_ERR.
+ */
+ rcu_read_unlock();
+ mutex_unlock(&scan_mutex);
+ if (v)
+ put_object(v);
+ }
+}
+
+/*
+ * Print the information for an unreferenced object to the seq file.
+ */
+static int kmemleak_seq_show(struct seq_file *seq, void *v)
+{
+ struct kmemleak_object *object = v;
+ unsigned long flags;
+
+ spin_lock_irqsave(&object->lock, flags);
+ if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
+ print_unreferenced(seq, object);
+ spin_unlock_irqrestore(&object->lock, flags);
+ return 0;
+}
+
+static const struct seq_operations kmemleak_seq_ops = {
+ .start = kmemleak_seq_start,
+ .next = kmemleak_seq_next,
+ .stop = kmemleak_seq_stop,
+ .show = kmemleak_seq_show,
+};
+
+static int kmemleak_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &kmemleak_seq_ops);
+}
+
+static int kmemleak_release(struct inode *inode, struct file *file)
+{
+ return seq_release(inode, file);
+}
+
+static int dump_str_object_info(const char *str)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ unsigned long addr;
+
+ if (kstrtoul(str, 0, &addr))
+ return -EINVAL;
+ object = find_and_get_object(addr, 0);
+ if (!object) {
+ pr_info("Unknown object at 0x%08lx\n", addr);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ dump_object_info(object);
+ spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+ return 0;
+}
+
+/*
+ * We use grey instead of black to ensure we can do future scans on the same
+ * objects. If we did not do future scans these black objects could
+ * potentially contain references to newly allocated objects in the future and
+ * we'd end up with false positives.
+ */
+static void kmemleak_clear(void)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+ if ((object->flags & OBJECT_REPORTED) &&
+ unreferenced_object(object))
+ __paint_it(object, KMEMLEAK_GREY);
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * File write operation to configure kmemleak at run-time. The following
+ * commands can be written to the /sys/kernel/debug/kmemleak file:
+ * off - disable kmemleak (irreversible)
+ * stack=on - enable the task stacks scanning
+ * stack=off - disable the tasks stacks scanning
+ * scan=on - start the automatic memory scanning thread
+ * scan=off - stop the automatic memory scanning thread
+ * scan=... - set the automatic memory scanning period in seconds (0 to
+ * disable it)
+ * scan - trigger a memory scan
+ * clear - mark all current reported unreferenced kmemleak objects as
+ * grey to ignore printing them
+ * dump=... - dump information about the object found at the given address
+ */
+static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
+ size_t size, loff_t *ppos)
+{
+ char buf[64];
+ int buf_size;
+ int ret;
+
+ if (!atomic_read(&kmemleak_enabled))
+ return -EBUSY;
+
+ buf_size = min(size, (sizeof(buf) - 1));
+ if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+ return -EFAULT;
+ buf[buf_size] = 0;
+
+ ret = mutex_lock_interruptible(&scan_mutex);
+ if (ret < 0)
+ return ret;
+
+ if (strncmp(buf, "off", 3) == 0)
+ kmemleak_disable();
+ else if (strncmp(buf, "stack=on", 8) == 0)
+ kmemleak_stack_scan = 1;
+ else if (strncmp(buf, "stack=off", 9) == 0)
+ kmemleak_stack_scan = 0;
+ else if (strncmp(buf, "scan=on", 7) == 0)
+ start_scan_thread();
+ else if (strncmp(buf, "scan=off", 8) == 0)
+ stop_scan_thread();
+ else if (strncmp(buf, "scan=", 5) == 0) {
+ unsigned long secs;
+
+ ret = strict_strtoul(buf + 5, 0, &secs);
+ if (ret < 0)
+ goto out;
+ stop_scan_thread();
+ if (secs) {
+ jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+ start_scan_thread();
+ }
+ } else if (strncmp(buf, "scan", 4) == 0)
+ kmemleak_scan();
+ else if (strncmp(buf, "clear", 5) == 0)
+ kmemleak_clear();
+ else if (strncmp(buf, "dump=", 5) == 0)
+ ret = dump_str_object_info(buf + 5);
+ else
+ ret = -EINVAL;
+
+out:
+ mutex_unlock(&scan_mutex);
+ if (ret < 0)
+ return ret;
+
+ /* ignore the rest of the buffer, only one command at a time */
+ *ppos += size;
+ return size;
+}
+
+static const struct file_operations kmemleak_fops = {
+ .owner = THIS_MODULE,
+ .open = kmemleak_open,
+ .read = seq_read,
+ .write = kmemleak_write,
+ .llseek = seq_lseek,
+ .release = kmemleak_release,
+};
+
+/*
+ * Stop the memory scanning thread and free the kmemleak internal objects if
+ * no previous scan thread (otherwise, kmemleak may still have some useful
+ * information on memory leaks).
+ */
+static void kmemleak_do_cleanup(struct work_struct *work)
+{
+ struct kmemleak_object *object;
+ bool cleanup = scan_thread == NULL;
+
+ mutex_lock(&scan_mutex);
+ stop_scan_thread();
+
+ if (cleanup) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list)
+ delete_object_full(object->pointer);
+ rcu_read_unlock();
+ }
+ mutex_unlock(&scan_mutex);
+}
+
+static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation.
+ */
+static void kmemleak_disable(void)
+{
+ /* atomically check whether it was already invoked */
+ if (atomic_cmpxchg(&kmemleak_error, 0, 1))
+ return;
+
+ /* stop any memory operation tracing */
+ atomic_set(&kmemleak_enabled, 0);
+
+ /* check whether it is too early for a kernel thread */
+ if (atomic_read(&kmemleak_initialized))
+ schedule_work(&cleanup_work);
+
+ pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ */
+static int kmemleak_boot_config(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (strcmp(str, "off") == 0)
+ kmemleak_disable();
+ else if (strcmp(str, "on") == 0)
+ kmemleak_skip_disable = 1;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+static void __init print_log_trace(struct early_log *log)
+{
+ struct stack_trace trace;
+
+ trace.nr_entries = log->trace_len;
+ trace.entries = log->trace;
+
+ pr_notice("Early log backtrace:\n");
+ print_stack_trace(&trace, 2);
+}
+
+/*
+ * Kmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+ int i;
+ unsigned long flags;
+
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ atomic_set(&kmemleak_early_log, 0);
+ kmemleak_disable();
+ return;
+ }
+#endif
+
+ jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+ jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+ object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+ scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+
+ if (crt_early_log >= ARRAY_SIZE(early_log))
+ pr_warning("Early log buffer exceeded (%d), please increase "
+ "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+
+ /* the kernel is still in UP mode, so disabling the IRQs is enough */
+ local_irq_save(flags);
+ atomic_set(&kmemleak_early_log, 0);
+ if (atomic_read(&kmemleak_error)) {
+ local_irq_restore(flags);
+ return;
+ } else
+ atomic_set(&kmemleak_enabled, 1);
+ local_irq_restore(flags);
+
+ /*
+ * This is the point where tracking allocations is safe. Automatic
+ * scanning is started during the late initcall. Add the early logged
+ * callbacks to the kmemleak infrastructure.
+ */
+ for (i = 0; i < crt_early_log; i++) {
+ struct early_log *log = &early_log[i];
+
+ switch (log->op_type) {
+ case KMEMLEAK_ALLOC:
+ early_alloc(log);
+ break;
+ case KMEMLEAK_ALLOC_PERCPU:
+ early_alloc_percpu(log);
+ break;
+ case KMEMLEAK_FREE:
+ kmemleak_free(log->ptr);
+ break;
+ case KMEMLEAK_FREE_PART:
+ kmemleak_free_part(log->ptr, log->size);
+ break;
+ case KMEMLEAK_FREE_PERCPU:
+ kmemleak_free_percpu(log->ptr);
+ break;
+ case KMEMLEAK_NOT_LEAK:
+ kmemleak_not_leak(log->ptr);
+ break;
+ case KMEMLEAK_IGNORE:
+ kmemleak_ignore(log->ptr);
+ break;
+ case KMEMLEAK_SCAN_AREA:
+ kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
+ break;
+ case KMEMLEAK_NO_SCAN:
+ kmemleak_no_scan(log->ptr);
+ break;
+ default:
+ kmemleak_warn("Unknown early log operation: %d\n",
+ log->op_type);
+ }
+
+ if (atomic_read(&kmemleak_warning)) {
+ print_log_trace(log);
+ atomic_set(&kmemleak_warning, 0);
+ }
+ }
+}
+
+/*
+ * Late initialization function.
+ */
+static int __init kmemleak_late_init(void)
+{
+ struct dentry *dentry;
+
+ atomic_set(&kmemleak_initialized, 1);
+
+ if (atomic_read(&kmemleak_error)) {
+ /*
+ * Some error occurred and kmemleak was disabled. There is a
+ * small chance that kmemleak_disable() was called immediately
+ * after setting kmemleak_initialized and we may end up with
+ * two clean-up threads but serialized by scan_mutex.
+ */
+ schedule_work(&cleanup_work);
+ return -ENOMEM;
+ }
+
+ dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
+ &kmemleak_fops);
+ if (!dentry)
+ pr_warning("Failed to create the debugfs kmemleak file\n");
+ mutex_lock(&scan_mutex);
+ start_scan_thread();
+ mutex_unlock(&scan_mutex);
+
+ pr_info("Kernel memory leak detector initialized\n");
+
+ return 0;
+}
+late_initcall(kmemleak_late_init);
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 00000000..b6afe0c4
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,2441 @@
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Authors:
+ * Izik Eidus
+ * Andrea Arcangeli
+ * Chris Wright
+ * Hugh Dickins
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/memory.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
+#include <linux/ksm.h>
+#include <linux/hashtable.h>
+#include <linux/freezer.h>
+#include <linux/oom.h>
+#include <linux/numa.h>
+
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+#ifdef CONFIG_NUMA
+#define NUMA(x) (x)
+#define DO_NUMA(x) do { (x); } while (0)
+#else
+#define NUMA(x) (0)
+#define DO_NUMA(x) do { } while (0)
+#endif
+
+/*
+ * A few notes about the KSM scanning process,
+ * to make it easier to understand the data structures below:
+ *
+ * In order to reduce excessive scanning, KSM sorts the memory pages by their
+ * contents into a data structure that holds pointers to the pages' locations.
+ *
+ * Since the contents of the pages may change at any moment, KSM cannot just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ * Therefore KSM uses two data structures - the stable and the unstable tree.
+ *
+ * The stable tree holds pointers to all the merged pages (ksm pages), sorted
+ * by their contents. Because each such page is write-protected, searching on
+ * this tree is fully assured to be working (except when pages are unmapped),
+ * and therefore this tree is called the stable tree.
+ *
+ * In addition to the stable tree, KSM uses a second data structure called the
+ * unstable tree: this tree holds pointers to pages which have been found to
+ * be "unchanged for a period of time". The unstable tree sorts these pages
+ * by their contents, but since they are not write-protected, KSM cannot rely
+ * upon the unstable tree to work correctly - the unstable tree is liable to
+ * be corrupted as its contents are modified, and so it is called unstable.
+ *
+ * KSM solves this problem by several techniques:
+ *
+ * 1) The unstable tree is flushed every time KSM completes scanning all
+ * memory areas, and then the tree is rebuilt again from the beginning.
+ * 2) KSM will only insert into the unstable tree, pages whose hash value
+ * has not changed since the previous scan of all memory areas.
+ * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
+ * colors of the nodes and not on their contents, assuring that even when
+ * the tree gets "corrupted" it won't get out of balance, so scanning time
+ * remains the same (also, searching and inserting nodes in an rbtree uses
+ * the same algorithm, so we have no overhead when we flush and rebuild).
+ * 4) KSM never flushes the stable tree, which means that even if it were to
+ * take 10 attempts to find a page in the unstable tree, once it is found,
+ * it is secured in the stable tree. (When we scan a new page, we first
+ * compare it against the stable tree, and then against the unstable tree.)
+ *
+ * If the merge_across_nodes tunable is unset, then KSM maintains multiple
+ * stable trees and multiple unstable trees: one of each for each NUMA node.
+ */
+
+/**
+ * struct mm_slot - ksm information per mm that is being scanned
+ * @link: link to the mm_slots hash list
+ * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node link;
+ struct list_head mm_list;
+ struct rmap_item *rmap_list;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct ksm_scan - cursor for scanning
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ * @rmap_list: link to the next rmap to be scanned in the rmap_list
+ * @seqnr: count of completed full scans (needed when removing unstable node)
+ *
+ * There is only the one ksm_scan instance of this cursor structure.
+ */
+struct ksm_scan {
+ struct mm_slot *mm_slot;
+ unsigned long address;
+ struct rmap_item **rmap_list;
+ unsigned long seqnr;
+};
+
+/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
+ */
+struct stable_node {
+ union {
+ struct rb_node node; /* when node of stable tree */
+ struct { /* when listed for migration */
+ struct list_head *head;
+ struct list_head list;
+ };
+ };
+ struct hlist_head hlist;
+ unsigned long kpfn;
+#ifdef CONFIG_NUMA
+ int nid;
+#endif
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @nid: NUMA node id of unstable tree in which linked (may not match page)
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @oldchecksum: previous checksum of the page at that virtual address
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
+ */
+struct rmap_item {
+ struct rmap_item *rmap_list;
+ union {
+ struct anon_vma *anon_vma; /* when stable */
+#ifdef CONFIG_NUMA
+ int nid; /* when node of unstable tree */
+#endif
+ };
+ struct mm_struct *mm;
+ unsigned long address; /* + low bits used for flags below */
+ unsigned int oldchecksum; /* when unstable */
+ union {
+ struct rb_node node; /* when node of unstable tree */
+ struct { /* when listed from stable tree */
+ struct stable_node *head;
+ struct hlist_node hlist;
+ };
+ };
+};
+
+#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
+#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
+#define STABLE_FLAG 0x200 /* is listed from the stable tree */
+
+/* The stable and unstable tree heads */
+static struct rb_root one_stable_tree[1] = { RB_ROOT };
+static struct rb_root one_unstable_tree[1] = { RB_ROOT };
+static struct rb_root *root_stable_tree = one_stable_tree;
+static struct rb_root *root_unstable_tree = one_unstable_tree;
+
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+
+#define MM_SLOTS_HASH_BITS 10
+static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct mm_slot ksm_mm_head = {
+ .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+};
+static struct ksm_scan ksm_scan = {
+ .mm_slot = &ksm_mm_head,
+};
+
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
+static struct kmem_cache *mm_slot_cache;
+
+/* The number of nodes in the stable tree */
+static unsigned long ksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long ksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long ksm_pages_unshared;
+
+/* The number of rmap_items in use: to calculate pages_volatile */
+static unsigned long ksm_rmap_items;
+
+/* Number of pages ksmd should scan in one batch */
+static unsigned int ksm_thread_pages_to_scan = 100;
+
+/* Milliseconds ksmd should sleep between batches */
+static unsigned int ksm_thread_sleep_millisecs = 20;
+
+#ifdef CONFIG_NUMA
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+static int ksm_nr_node_ids = 1;
+#else
+#define ksm_merge_across_nodes 1U
+#define ksm_nr_node_ids 1
+#endif
+
+#define KSM_RUN_STOP 0
+#define KSM_RUN_MERGE 1
+#define KSM_RUN_UNMERGE 2
+#define KSM_RUN_OFFLINE 4
+static unsigned long ksm_run = KSM_RUN_STOP;
+static void wait_while_offlining(void);
+
+static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DEFINE_MUTEX(ksm_thread_mutex);
+static DEFINE_SPINLOCK(ksm_mmlist_lock);
+
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+ sizeof(struct __struct), __alignof__(struct __struct),\
+ (__flags), NULL)
+
+static int __init ksm_slab_init(void)
+{
+ rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+ if (!rmap_item_cache)
+ goto out;
+
+ stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+ if (!stable_node_cache)
+ goto out_free1;
+
+ mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+ if (!mm_slot_cache)
+ goto out_free2;
+
+ return 0;
+
+out_free2:
+ kmem_cache_destroy(stable_node_cache);
+out_free1:
+ kmem_cache_destroy(rmap_item_cache);
+out:
+ return -ENOMEM;
+}
+
+static void __init ksm_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ kmem_cache_destroy(stable_node_cache);
+ kmem_cache_destroy(rmap_item_cache);
+ mm_slot_cache = NULL;
+}
+
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+ struct rmap_item *rmap_item;
+
+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
+ if (rmap_item)
+ ksm_rmap_items++;
+ return rmap_item;
+}
+
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+ ksm_rmap_items--;
+ rmap_item->mm = NULL; /* debug safety */
+ kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+static inline struct stable_node *alloc_stable_node(void)
+{
+ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+ kmem_cache_free(stable_node_cache, stable_node);
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *slot;
+
+ hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
+ if (slot->mm == mm)
+ return slot;
+
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ mm_slot->mm = mm;
+ hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
+}
+
+/*
+ * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
+ * page tables after it has passed through ksm_exit() - which, if necessary,
+ * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
+ * a special flag: they can just back out as soon as mm_users goes to zero.
+ * ksm_test_exit() is used throughout to make this test for exit: in some
+ * places for correctness, in some places just to avoid unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+/*
+ * We use break_ksm to break COW on a ksm page: it's a stripped down
+ *
+ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ * put_page(page);
+ *
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * in case the application has unmapped and remapped mm,addr meanwhile.
+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ */
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ int ret = 0;
+
+ do {
+ cond_resched();
+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+ if (IS_ERR_OR_NULL(page))
+ break;
+ if (PageKsm(page))
+ ret = handle_mm_fault(vma->vm_mm, vma, addr,
+ FAULT_FLAG_WRITE);
+ else
+ ret = VM_FAULT_WRITE;
+ put_page(page);
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
+ /*
+ * We must loop because handle_mm_fault() may back out if there's
+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
+ *
+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+ * COW has been broken, even if the vma does not permit VM_WRITE;
+ * but note that a concurrent fault might break PageKsm for us.
+ *
+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
+ * backing file, which also invalidates anonymous pages: that's
+ * okay, that truncation will have unmapped the PageKsm for us.
+ *
+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+ * current task has TIF_MEMDIE set, and will be OOM killed on return
+ * to user; and ksmd, having no mm, would never be chosen for that.
+ *
+ * But if the mm is in a limited mem_cgroup, then the fault may fail
+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+ * even ksmd can fail in this way - though it's usually breaking ksm
+ * just to undo a merge it made a moment before, so unlikely to oom.
+ *
+ * That's a pity: we might therefore have more kernel pages allocated
+ * than we're counting as nodes in the stable tree; but ksm_do_scan
+ * will retry to break_cow on each pass, so should recover the page
+ * in due course. The important thing is to not let VM_MERGEABLE
+ * be cleared while any such pages might remain in the area.
+ */
+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
+}
+
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ if (ksm_test_exit(mm))
+ return NULL;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ return NULL;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ return NULL;
+ return vma;
+}
+
+static void break_cow(struct rmap_item *rmap_item)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
+ struct vm_area_struct *vma;
+
+ /*
+ * It is not an accident that whenever we want to break COW
+ * to undo, we also need to drop a reference to the anon_vma.
+ */
+ put_anon_vma(rmap_item->anon_vma);
+
+ down_read(&mm->mmap_sem);
+ vma = find_mergeable_vma(mm, addr);
+ if (vma)
+ break_ksm(vma, addr);
+ up_read(&mm->mmap_sem);
+}
+
+static struct page *page_trans_compound_anon(struct page *page)
+{
+ if (PageTransCompound(page)) {
+ struct page *head = compound_trans_head(page);
+ /*
+ * head may actually be splitted and freed from under
+ * us but it's ok here.
+ */
+ if (PageAnon(head))
+ return head;
+ }
+ return NULL;
+}
+
+static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ down_read(&mm->mmap_sem);
+ vma = find_mergeable_vma(mm, addr);
+ if (!vma)
+ goto out;
+
+ page = follow_page(vma, addr, FOLL_GET);
+ if (IS_ERR_OR_NULL(page))
+ goto out;
+ if (PageAnon(page) || page_trans_compound_anon(page)) {
+ flush_anon_page(vma, page, addr);
+ flush_dcache_page(page);
+ } else {
+ put_page(page);
+out: page = NULL;
+ }
+ up_read(&mm->mmap_sem);
+ return page;
+}
+
+/*
+ * This helper is used for getting right index into array of tree roots.
+ * When merge_across_nodes knob is set to 1, there are only two rb-trees for
+ * stable and unstable pages from all nodes with roots in index 0. Otherwise,
+ * every node has its own stable and unstable tree.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+ return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
+}
+
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+ struct rmap_item *rmap_item;
+
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ if (rmap_item->hlist.next)
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+ put_anon_vma(rmap_item->anon_vma);
+ rmap_item->address &= PAGE_MASK;
+ cond_resched();
+ }
+
+ if (stable_node->head == &migrate_nodes)
+ list_del(&stable_node->list);
+ else
+ rb_erase(&stable_node->node,
+ root_stable_tree + NUMA(stable_node->nid));
+ free_stable_node(stable_node);
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ * But beware, the stable node's page might be being migrated.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+{
+ struct page *page;
+ void *expected_mapping;
+ unsigned long kpfn;
+
+ expected_mapping = (void *)stable_node +
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+again:
+ kpfn = ACCESS_ONCE(stable_node->kpfn);
+ page = pfn_to_page(kpfn);
+
+ /*
+ * page is computed from kpfn, so on most architectures reading
+ * page->mapping is naturally ordered after reading node->kpfn,
+ * but on Alpha we need to be more careful.
+ */
+ smp_read_barrier_depends();
+ if (ACCESS_ONCE(page->mapping) != expected_mapping)
+ goto stale;
+
+ /*
+ * We cannot do anything with the page while its refcount is 0.
+ * Usually 0 means free, or tail of a higher-order page: in which
+ * case this node is no longer referenced, and should be freed;
+ * however, it might mean that the page is under page_freeze_refs().
+ * The __remove_mapping() case is easy, again the node is now stale;
+ * but if page is swapcache in migrate_page_move_mapping(), it might
+ * still be our page, in which case it's essential to keep the node.
+ */
+ while (!get_page_unless_zero(page)) {
+ /*
+ * Another check for page->mapping != expected_mapping would
+ * work here too. We have chosen the !PageSwapCache test to
+ * optimize the common case, when the page is or is about to
+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
+ * in the freeze_refs section of __remove_mapping(); but Anon
+ * page->mapping reset to NULL later, in free_pages_prepare().
+ */
+ if (!PageSwapCache(page))
+ goto stale;
+ cpu_relax();
+ }
+
+ if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ put_page(page);
+ goto stale;
+ }
+
+ if (lock_it) {
+ lock_page(page);
+ if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto stale;
+ }
+ }
+ return page;
+
+stale:
+ /*
+ * We come here from above when page->mapping or !PageSwapCache
+ * suggests that the node is stale; but it might be under migration.
+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+ * before checking whether node->kpfn has been changed.
+ */
+ smp_rmb();
+ if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+ goto again;
+ remove_node_from_stable_tree(stable_node);
+ return NULL;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ */
+static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+ if (rmap_item->address & STABLE_FLAG) {
+ struct stable_node *stable_node;
+ struct page *page;
+
+ stable_node = rmap_item->head;
+ page = get_ksm_page(stable_node, true);
+ if (!page)
+ goto out;
+
+ hlist_del(&rmap_item->hlist);
+ unlock_page(page);
+ put_page(page);
+
+ if (stable_node->hlist.first)
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+
+ put_anon_vma(rmap_item->anon_vma);
+ rmap_item->address &= PAGE_MASK;
+
+ } else if (rmap_item->address & UNSTABLE_FLAG) {
+ unsigned char age;
+ /*
+ * Usually ksmd can and must skip the rb_erase, because
+ * root_unstable_tree was already reset to RB_ROOT.
+ * But be careful when an mm is exiting: do the rb_erase
+ * if this rmap_item was inserted by this scan, rather
+ * than left over from before.
+ */
+ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
+ BUG_ON(age > 1);
+ if (!age)
+ rb_erase(&rmap_item->node,
+ root_unstable_tree + NUMA(rmap_item->nid));
+ ksm_pages_unshared--;
+ rmap_item->address &= PAGE_MASK;
+ }
+out:
+ cond_resched(); /* we're called from many long loops */
+}
+
+static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
+ struct rmap_item **rmap_list)
+{
+ while (*rmap_list) {
+ struct rmap_item *rmap_item = *rmap_list;
+ *rmap_list = rmap_item->rmap_list;
+ remove_rmap_item_from_tree(rmap_item);
+ free_rmap_item(rmap_item);
+ }
+}
+
+/*
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ */
+static int unmerge_ksm_pages(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+ int err = 0;
+
+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+ if (ksm_test_exit(vma->vm_mm))
+ break;
+ if (signal_pending(current))
+ err = -ERESTARTSYS;
+ else
+ err = break_ksm(vma, addr);
+ }
+ return err;
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * Only called through the sysfs control interface:
+ */
+static int remove_stable_node(struct stable_node *stable_node)
+{
+ struct page *page;
+ int err;
+
+ page = get_ksm_page(stable_node, true);
+ if (!page) {
+ /*
+ * get_ksm_page did remove_node_from_stable_tree itself.
+ */
+ return 0;
+ }
+
+ if (WARN_ON_ONCE(page_mapped(page))) {
+ /*
+ * This should not happen: but if it does, just refuse to let
+ * merge_across_nodes be switched - there is no need to panic.
+ */
+ err = -EBUSY;
+ } else {
+ /*
+ * The stable node did not yet appear stale to get_ksm_page(),
+ * since that allows for an unmapped ksm page to be recognized
+ * right up until it is freed; but the node is safe to remove.
+ * This page might be in a pagevec waiting to be freed,
+ * or it might be PageSwapCache (perhaps under writeback),
+ * or it might have been removed from swapcache a moment ago.
+ */
+ set_page_stable_node(page, NULL);
+ remove_node_from_stable_tree(stable_node);
+ err = 0;
+ }
+
+ unlock_page(page);
+ put_page(page);
+ return err;
+}
+
+static int remove_all_stable_nodes(void)
+{
+ struct stable_node *stable_node;
+ struct list_head *this, *next;
+ int nid;
+ int err = 0;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ while (root_stable_tree[nid].rb_node) {
+ stable_node = rb_entry(root_stable_tree[nid].rb_node,
+ struct stable_node, node);
+ if (remove_stable_node(stable_node)) {
+ err = -EBUSY;
+ break; /* proceed to next nid */
+ }
+ cond_resched();
+ }
+ }
+ list_for_each_safe(this, next, &migrate_nodes) {
+ stable_node = list_entry(this, struct stable_node, list);
+ if (remove_stable_node(stable_node))
+ err = -EBUSY;
+ cond_resched();
+ }
+ return err;
+}
+
+static int unmerge_and_remove_all_rmap_items(void)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
+ struct mm_slot, mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ for (mm_slot = ksm_scan.mm_slot;
+ mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (ksm_test_exit(mm))
+ break;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ continue;
+ err = unmerge_ksm_pages(vma,
+ vma->vm_start, vma->vm_end);
+ if (err)
+ goto error;
+ }
+
+ remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_test_exit(mm)) {
+ hash_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ up_read(&mm->mmap_sem);
+ mmdrop(mm);
+ } else {
+ spin_unlock(&ksm_mmlist_lock);
+ up_read(&mm->mmap_sem);
+ }
+ }
+
+ /* Clean up stable nodes, but don't worry if some are still busy */
+ remove_all_stable_nodes();
+ ksm_scan.seqnr = 0;
+ return 0;
+
+error:
+ up_read(&mm->mmap_sem);
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = &ksm_mm_head;
+ spin_unlock(&ksm_mmlist_lock);
+ return err;
+}
+#endif /* CONFIG_SYSFS */
+
+static u32 calc_checksum(struct page *page)
+{
+ u32 checksum;
+ void *addr = kmap_atomic(page);
+ checksum = jhash2(addr, PAGE_SIZE / 4, 17);
+ kunmap_atomic(addr);
+ return checksum;
+}
+
+static int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+ return !memcmp_pages(page1, page2);
+}
+
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+ pte_t *orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ int swapped;
+ int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ BUG_ON(PageTransCompound(page));
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto out_mn;
+
+ if (pte_write(*ptep) || pte_dirty(*ptep)) {
+ pte_t entry;
+
+ swapped = PageSwapCache(page);
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ /*
+ * Ok this is tricky, when get_user_pages_fast() run it doesn't
+ * take any lock, therefore the check that we are going to make
+ * with the pagecount against the mapcount is racey and
+ * O_DIRECT can happen right after the check.
+ * So we clear the pte and flush the tlb before the check
+ * this assure us that no O_DIRECT can happen after the check
+ * or in the middle of the check.
+ */
+ entry = ptep_clear_flush(vma, addr, ptep);
+ /*
+ * Check that no O_DIRECT or similar I/O is in progress on the
+ * page
+ */
+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+ set_pte_at(mm, addr, ptep, entry);
+ goto out_unlock;
+ }
+ if (pte_dirty(entry))
+ set_page_dirty(page);
+ entry = pte_mkclean(pte_wrprotect(entry));
+ set_pte_at_notify(mm, addr, ptep, entry);
+ }
+ *orig_pte = *ptep;
+ err = 0;
+
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+ return err;
+}
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma: vma that holds the pte pointing to page
+ * @page: the page we are replacing by kpage
+ * @kpage: the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+ struct page *kpage, pte_t orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+ BUG_ON(pmd_trans_huge(*pmd));
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte_same(*ptep, orig_pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out_mn;
+ }
+
+ get_page(kpage);
+ page_add_anon_rmap(kpage, vma, addr);
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ put_page(page);
+
+ pte_unmap_unlock(ptep, ptl);
+ err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+ return err;
+}
+
+static int page_trans_compound_anon_split(struct page *page)
+{
+ int ret = 0;
+ struct page *transhuge_head = page_trans_compound_anon(page);
+ if (transhuge_head) {
+ /* Get the reference on the head to split it. */
+ if (get_page_unless_zero(transhuge_head)) {
+ /*
+ * Recheck we got the reference while the head
+ * was still anonymous.
+ */
+ if (PageAnon(transhuge_head))
+ ret = split_huge_page(transhuge_head);
+ else
+ /*
+ * Retry later if split_huge_page run
+ * from under us.
+ */
+ ret = 1;
+ put_page(transhuge_head);
+ } else
+ /* Retry later if split_huge_page run from under us. */
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * try_to_merge_one_page - take two pages and merge them into one
+ * @vma: the vma that holds the pte pointing to page
+ * @page: the PageAnon page that we want to replace with kpage
+ * @kpage: the PageKsm page that we want to map instead of page,
+ * or NULL the first time when we want to use page as kpage.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_one_page(struct vm_area_struct *vma,
+ struct page *page, struct page *kpage)
+{
+ pte_t orig_pte = __pte(0);
+ int err = -EFAULT;
+
+ if (page == kpage) /* ksm page forked */
+ return 0;
+
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
+ if (!PageAnon(page))
+ goto out;
+
+ /*
+ * We need the page lock to read a stable PageSwapCache in
+ * write_protect_page(). We use trylock_page() instead of
+ * lock_page() because we don't want to wait here - we
+ * prefer to continue scanning and merging different pages,
+ * then come back to this page when it is unlocked.
+ */
+ if (!trylock_page(page))
+ goto out;
+ /*
+ * If this anonymous page is mapped only here, its pte may need
+ * to be write-protected. If it's mapped elsewhere, all of its
+ * ptes are necessarily already write-protected. But in either
+ * case, we need to lock and check page_count is not raised.
+ */
+ if (write_protect_page(vma, page, &orig_pte) == 0) {
+ if (!kpage) {
+ /*
+ * While we hold page lock, upgrade page from
+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * stable_tree_insert() will update stable_node.
+ */
+ set_page_stable_node(page, NULL);
+ mark_page_accessed(page);
+ err = 0;
+ } else if (pages_identical(page, kpage))
+ err = replace_page(vma, page, kpage, orig_pte);
+ }
+
+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
+ munlock_vma_page(page);
+ if (!PageMlocked(kpage)) {
+ unlock_page(page);
+ lock_page(kpage);
+ mlock_vma_page(kpage);
+ page = kpage; /* for final unlock */
+ }
+ }
+
+ unlock_page(page);
+out:
+ return err;
+}
+
+/*
+ * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
+ * but no new kernel page is allocated: kpage must already be a ksm page.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ struct vm_area_struct *vma;
+ int err = -EFAULT;
+
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ goto out;
+ vma = find_vma(mm, rmap_item->address);
+ if (!vma || vma->vm_start > rmap_item->address)
+ goto out;
+
+ err = try_to_merge_one_page(vma, page, kpage);
+ if (err)
+ goto out;
+
+ /* Unstable nid is in union with stable anon_vma: remove first */
+ remove_rmap_item_from_tree(rmap_item);
+
+ /* Must get reference to anon_vma while still holding mmap_sem */
+ rmap_item->anon_vma = vma->anon_vma;
+ get_anon_vma(vma->anon_vma);
+out:
+ up_read(&mm->mmap_sem);
+ return err;
+}
+
+/*
+ * try_to_merge_two_pages - take two identical pages and prepare them
+ * to be merged into one page.
+ *
+ * This function returns the kpage if we successfully merged two identical
+ * pages into one ksm page, NULL otherwise.
+ *
+ * Note that this function upgrades page to ksm page: if one of the pages
+ * is already a ksm page, try_to_merge_with_ksm_page should be used.
+ */
+static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+ struct page *page,
+ struct rmap_item *tree_rmap_item,
+ struct page *tree_page)
+{
+ int err;
+
+ err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
+ if (!err) {
+ err = try_to_merge_with_ksm_page(tree_rmap_item,
+ tree_page, page);
+ /*
+ * If that fails, we have a ksm page with only one pte
+ * pointing to it: so break it.
+ */
+ if (err)
+ break_cow(rmap_item);
+ }
+ return err ? NULL : page;
+}
+
+/*
+ * stable_tree_search - search for page inside the stable tree
+ *
+ * This function checks if there is a page inside the stable tree
+ * with identical content to the page that we are scanning right now.
+ *
+ * This function returns the stable tree node of identical content if found,
+ * NULL otherwise.
+ */
+static struct page *stable_tree_search(struct page *page)
+{
+ int nid;
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent;
+ struct stable_node *stable_node;
+ struct stable_node *page_node;
+
+ page_node = page_stable_node(page);
+ if (page_node && page_node->head != &migrate_nodes) {
+ /* ksm page forked */
+ get_page(page);
+ return page;
+ }
+
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_stable_tree + nid;
+again:
+ new = &root->rb_node;
+ parent = NULL;
+
+ while (*new) {
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node, false);
+ if (!tree_page)
+ return NULL;
+
+ ret = memcmp_pages(page, tree_page);
+ put_page(tree_page);
+
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ /*
+ * Lock and unlock the stable_node's page (which
+ * might already have been migrated) so that page
+ * migration is sure to notice its raised count.
+ * It would be more elegant to return stable_node
+ * than kpage, but that involves more changes.
+ */
+ tree_page = get_ksm_page(stable_node, true);
+ if (tree_page) {
+ unlock_page(tree_page);
+ if (get_kpfn_nid(stable_node->kpfn) !=
+ NUMA(stable_node->nid)) {
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
+ }
+ /*
+ * There is now a place for page_node, but the tree may
+ * have been rebalanced, so re-evaluate parent and new.
+ */
+ if (page_node)
+ goto again;
+ return NULL;
+ }
+ }
+
+ if (!page_node)
+ return NULL;
+
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_link_node(&page_node->node, parent, new);
+ rb_insert_color(&page_node->node, root);
+ get_page(page);
+ return page;
+
+replace: