aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile19
-rw-r--r--mm/kasan/Makefile2
-rw-r--r--mm/kasan/kasan_init.c4
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mlock.c7
-rw-r--r--mm/mmap.c63
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/page_alloc.c107
-rw-r--r--mm/shmem.c13
-rw-r--r--mm/slab.c30
-rw-r--r--mm/slub.c140
-rw-r--r--mm/swapfile.c42
-rw-r--r--mm/usercopy.c278
-rw-r--r--mm/vmscan.c43
-rw-r--r--mm/vmstat.c73
-rw-r--r--mm/zbud.c3
-rw-r--r--mm/zpool.c6
-rw-r--r--mm/zsmalloc.c1221
-rw-r--r--mm/zswap.c5
21 files changed, 1707 insertions, 367 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b078fd..95c5728d65bc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -601,6 +601,16 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in the zsmalloc to collect various
+ statistics about whats happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
config GENERIC_EARLY_IOREMAP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 3614adbf6d32..4e5306ee3271 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,24 @@
KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slub.o := n
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
+# Since __builtin_frame_address does work as used, disable the warning.
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -73,3 +91,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 64710148941e..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,4 +1,6 @@
KASAN_SANITIZE := n
+UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index f41ac9a551cc..3f9a41cf0ac6 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -29,10 +29,10 @@
*/
unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
#endif
pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
diff --git a/mm/madvise.c b/mm/madvise.c
index 0938b30da4ab..dd7d354c5bfa 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -102,7 +102,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6a23ccc46f0f..e8b241972568 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -770,7 +770,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol);
+ new_pol, vma_get_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index abf1f0f51eda..8aa350f950e6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -26,10 +26,10 @@
int can_do_mlock(void)
{
- if (capable(CAP_IPC_LOCK))
- return 1;
if (rlimit(RLIMIT_MEMLOCK) != 0)
return 1;
+ if (capable(CAP_IPC_LOCK))
+ return 1;
return 0;
}
EXPORT_SYMBOL(can_do_mlock);
@@ -567,7 +567,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 954ad5a3c140..496e6d221ad8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -57,6 +57,18 @@
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -946,7 +958,8 @@ again: remove_next = 1 + (end > next->vm_end);
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ const char __user *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -962,6 +975,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (vma_get_anon_name(vma) != anon_name)
+ return 0;
return 1;
}
@@ -992,9 +1007,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1011,9 +1027,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1024,9 +1041,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1055,8 +1072,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t pgoff, struct mempolicy *policy,
+ const char __user *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1081,16 +1099,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
- can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ mpol_equal(vma_policy(prev), policy) &&
+ can_vma_merge_after(prev, vm_flags, anon_vma,
+ file, pgoff, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ can_vma_merge_before(next, vm_flags, anon_vma,
+ file, pgoff+pglen, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1109,9 +1127,9 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags, anon_vma,
+ file, pgoff+pglen, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
@@ -1598,7 +1616,8 @@ munmap_back:
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+ NULL, NULL);
if (vma)
goto out;
@@ -2638,6 +2657,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
+EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long start, size_t len)
{
@@ -2723,7 +2743,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL);
if (vma)
goto out;
@@ -2882,7 +2902,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ace93454ce8e..5911e603269d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -289,7 +289,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (*pprev) {
vma = *pprev;
goto success;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e6f7878a2d6..be880ae2f633 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,8 +205,21 @@ static char * const zone_names[MAX_NR_ZONES] = {
"Movable",
};
+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int min_free_order_shift = 1;
+
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -1089,39 +1102,34 @@ static void change_pageblock_range(struct page *pageblock_page,
}
/*
- * If breaking a large block of pages, move all free pages to the preferred
- * allocation list. If falling back for a reclaimable kernel allocation, be
- * more aggressive about taking ownership of free pages.
+ * When we are falling back to another migratetype during allocation, try to
+ * steal extra free pages from the same pageblocks to satisfy further
+ * allocations, instead of polluting multiple pageblocks.
*
- * On the other hand, never change migration type of MIGRATE_CMA pageblocks
- * nor move CMA pages to different free lists. We don't want unmovable pages
- * to be allocated from MIGRATE_CMA areas.
+ * If we are stealing a relatively large buddy page, it is likely there will
+ * be more free pages in the pageblock, so try to steal them all. For
+ * reclaimable and unmovable allocations, we steal regardless of page size,
+ * as fragmentation caused by those allocations polluting movable pageblocks
+ * is worse than movable allocations stealing from unmovable and reclaimable
+ * pageblocks.
*
- * Returns the allocation migratetype if free pages were stolen, or the
- * fallback migratetype if it was decided not to steal.
+ * If we claim more than half of the pageblock, change pageblock's migratetype
+ * as well.
*/
-static int try_to_steal_freepages(struct zone *zone, struct page *page,
+static void try_to_steal_freepages(struct zone *zone, struct page *page,
int start_type, int fallback_type)
{
unsigned int current_order = page_order(page);
- /*
- * When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself. We also ensure the freepage_migratetype
- * is set to CMA so it is returned to the correct freelist in case
- * the page ends up being not actually allocated from the pcp lists.
- */
- if (is_migrate_cma(fallback_type))
- return fallback_type;
-
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return start_type;
+ return;
}
if (current_order >= pageblock_order / 2 ||
start_type == MIGRATE_RECLAIMABLE ||
+ start_type == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled) {
int pages;
@@ -1131,11 +1139,7 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
if (pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
-
- return start_type;
}
-
- return fallback_type;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1145,14 +1149,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
- int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
+ int i;
for (i = 0;; i++) {
- migratetype = fallbacks[start_migratetype][i];
+ int migratetype = fallbacks[start_migratetype][i];
+ int buddy_type = start_migratetype;
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
@@ -1166,22 +1171,36 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct page, lru);
area->nr_free--;
- new_type = try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
+ if (!is_migrate_cma(migratetype)) {
+ try_to_steal_freepages(zone, page,
+ start_migratetype,
+ migratetype);
+ } else {
+ /*
+ * When borrowing from MIGRATE_CMA, we need to
+ * release the excess buddy pages to CMA
+ * itself, and we do not try to steal extra
+ * free pages.
+ */
+ buddy_type = migratetype;
+ }
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
expand(zone, page, order, current_order, area,
- new_type);
- /* The freepage_migratetype may differ from pageblock's
+ buddy_type);
+
+ /*
+ * The freepage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages. This is OK as long as it does
- * not differ for MIGRATE_CMA type.
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
*/
- set_freepage_migratetype(page, new_type);
+ set_freepage_migratetype(page, buddy_type);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
@@ -1750,7 +1769,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
- min >>= 1;
+ min >>= min_free_order_shift;
if (free_pages <= min)
return false;
@@ -5681,6 +5700,7 @@ static void setup_per_zone_lowmem_reserve(void)
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
@@ -5692,11 +5712,14 @@ static void __setup_per_zone_wmarks(void)
}
for_each_zone(zone) {
- u64 tmp;
+ u64 min, low;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
- do_div(tmp, lowmem_pages);
+ min = (u64)pages_min * zone->managed_pages;
+ do_div(min, lowmem_pages);
+ low = (u64)pages_low * zone->managed_pages;
+ do_div(low, vm_total_pages);
+
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5717,11 +5740,13 @@ static void __setup_per_zone_wmarks(void)
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->watermark[WMARK_MIN] = min;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
+ low + (min >> 2);
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+ low + (min >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -5845,7 +5870,7 @@ module_init(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
- * changes.
+ * or extra_free_kbytes changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/shmem.c b/mm/shmem.c
index fac22b578eb9..b159434edcaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3406,6 +3406,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = file;
+ vma->vm_ops = &shmem_vm_ops;
+}
+
/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -3419,10 +3427,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (IS_ERR(file))
return PTR_ERR(file);
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ shmem_set_file(vma, file);
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index b7f9f6456a61..918ffe2dbb08 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4214,6 +4214,36 @@ static int __init slab_proc_init(void)
module_init(slab_proc_init);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page)
+{
+ struct kmem_cache *cachep;
+ unsigned int objnr;
+ unsigned long offset;
+
+ /* Find and validate object. */
+ cachep = page->slab_cache;
+ objnr = obj_to_index(cachep, page, (void *)ptr);
+ BUG_ON(objnr >= cachep->num);
+
+ /* Find offset within object. */
+ offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+
+ /* Allow address range falling entirely within object size. */
+ if (offset <= cachep->object_size && n <= cachep->object_size - offset)
+ return NULL;
+
+ return cachep->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
/**
* ksize - get the actual amount of memory allocated for a given object
* @objp: Pointer to the object
diff --git a/mm/slub.c b/mm/slub.c
index 1256a8f603a2..8bc50c0ca2b5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
- for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
- __p += (__s)->size)
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = (__addr), __idx = 1; __idx <= __objects;\
- __p += (__s)->size, __idx++)
+ for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+ __idx <= __objects; \
+ __p += (__s)->size, __idx++)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
@@ -489,6 +497,26 @@ static inline void metadata_access_disable(void)
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
@@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
@@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
@@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
@@ -818,6 +851,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -1441,7 +1478,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
out:
@@ -3044,7 +3081,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size += 2 * sizeof(struct track);
- if (flags & SLAB_RED_ZONE)
+ if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
@@ -3053,6 +3090,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* of the object.
*/
size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
@@ -3363,6 +3405,46 @@ static size_t __ksize(const void *object)
return slab_ksize(page->slab_cache);
}
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page)
+{
+ struct kmem_cache *s;
+ unsigned long offset;
+ size_t object_size;
+
+ /* Find object and usable object size. */
+ s = page->slab_cache;
+ object_size = slab_ksize(s);
+
+ /* Reject impossible pointers. */
+ if (ptr < page_address(page))
+ return s->name;
+
+ /* Find offset within object. */
+ offset = (ptr - page_address(page)) % s->size;
+
+ /* Adjust for redzone and reject if within the redzone. */
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (offset < s->red_left_pad)
+ return s->name;
+ offset -= s->red_left_pad;
+ }
+
+ /* Allow address range falling entirely within object size. */
+ if (offset <= object_size && n <= object_size - offset)
+ return NULL;
+
+ return s->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
size_t ksize(const void *object)
{
size_t size = __ksize(object);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14ae9e3ec728..38b66a9823b4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
}
/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ offset = swp_offset(entry);
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_entry(page->lru.next, struct page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ spin_unlock(&p->lock);
+ return count;
+}
+
+/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
diff --git a/mm/usercopy.c b/mm/usercopy.c
new file mode 100644
index 000000000000..b34996a3860b
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,278 @@
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+
+enum {
+ BAD_STACK = -1,
+ NOT_STACK = 0,
+ GOOD_FRAME,
+ GOOD_STACK,
+};
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ * Returns:
+ * NOT_STACK: not at all on the stack
+ * GOOD_FRAME: fully within a valid stack frame
+ * GOOD_STACK: fully on the stack (when can't do frame-checking)
+ * BAD_STACK: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+ const void * const stack = task_stack_page(current);
+ const void * const stackend = stack + THREAD_SIZE;
+ int ret;
+
+ /* Object is not on the stack at all. */
+ if (obj + len <= stack || stackend <= obj)
+ return NOT_STACK;
+
+ /*
+ * Reject: object partially overlaps the stack (passing the
+ * the check above means at least one end is within the stack,
+ * so if this check fails, the other end is outside the stack).
+ */
+ if (obj < stack || stackend < obj + len)
+ return BAD_STACK;
+
+ /* Check if object is safely within a valid frame. */
+ ret = arch_within_stack_frames(stack, stackend, obj, len);
+ if (ret)
+ return ret;
+
+ return GOOD_STACK;
+}
+
+static void report_usercopy(const void *ptr, unsigned long len,
+ bool to_user, const char *type)
+{
+ pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to", ptr, type ? : "unknown", len);
+ /*
+ * For greater effect, it would be nice to do do_group_exit(),
+ * but BUG() actually hooks all the lock-breaking and per-arch
+ * Oops code, so that is used here instead.
+ */
+ BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
+static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
+ unsigned long high)
+{
+ unsigned long check_low = (uintptr_t)ptr;
+ unsigned long check_high = check_low + n;
+
+ /* Does not overlap if entirely above or entirely below. */
+ if (check_low >= high || check_high <= low)
+ return false;
+
+ return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline const char *check_kernel_text_object(const void *ptr,
+ unsigned long n)
+{
+ unsigned long textlow = (unsigned long)_stext;
+ unsigned long texthigh = (unsigned long)_etext;
+ unsigned long textlow_linear, texthigh_linear;
+
+ if (overlaps(ptr, n, textlow, texthigh))
+ return "<kernel text>";
+
+ /*
+ * Some architectures have virtual memory mappings with a secondary
+ * mapping of the kernel text, i.e. there is more than one virtual
+ * kernel address that points to the kernel image. It is usually
+ * when there is a separate linear physical memory mapping, in that
+ * __pa() is not just the reverse of __va(). This can be detected
+ * and checked:
+ */
+ textlow_linear = (unsigned long)__va(__pa(textlow));
+ /* No different mapping: we're done. */
+ if (textlow_linear == textlow)
+ return NULL;
+
+ /* Check the secondary mapping... */
+ texthigh_linear = (unsigned long)__va(__pa(texthigh));
+ if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+ return "<linear kernel text>";
+
+ return NULL;
+}
+
+static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+{
+ /* Reject if object wraps past end of memory. */
+ if ((unsigned long)ptr + n < (unsigned long)ptr)
+ return "<wrapped address>";
+
+ /* Reject if NULL or ZERO-allocation. */
+ if (ZERO_OR_NULL_PTR(ptr))
+ return "<null>";
+
+ return NULL;
+}
+
+/* Checks for allocs that are marked in some way as spanning multiple pages. */
+static inline const char *check_page_span(const void *ptr, unsigned long n,
+ struct page *page, bool to_user)
+{
+#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
+ const void *end = ptr + n - 1;
+ struct page *endpage;
+ bool is_reserved, is_cma;
+
+ /*
+ * Sometimes the kernel data regions are not marked Reserved (see
+ * check below). And sometimes [_sdata,_edata) does not cover
+ * rodata and/or bss, so check each range explicitly.
+ */
+
+ /* Allow reads of kernel rodata region (if not marked as Reserved). */
+ if (ptr >= (const void *)__start_rodata &&
+ end <= (const void *)__end_rodata) {
+ if (!to_user)
+ return "<rodata>";
+ return NULL;
+ }
+
+ /* Allow kernel data region (if not marked as Reserved). */
+ if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+ return NULL;
+
+ /* Allow kernel bss region (if not marked as Reserved). */
+ if (ptr >= (const void *)__bss_start &&
+ end <= (const void *)__bss_stop)
+ return NULL;
+
+ /* Is the object wholly within one base page? */
+ if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+ ((unsigned long)end & (unsigned long)PAGE_MASK)))
+ return NULL;
+
+ /* Allow if fully inside the same compound (__GFP_COMP) page. */
+ endpage = virt_to_head_page(end);
+ if (likely(endpage == page))
+ return NULL;
+
+ /*
+ * Reject if range is entirely either Reserved (i.e. special or
+ * device memory), or CMA. Otherwise, reject since the object spans
+ * several independently allocated pages.
+ */
+ is_reserved = PageReserved(page);
+ is_cma = is_migrate_cma_page(page);
+ if (!is_reserved && !is_cma)
+ return "<spans multiple pages>";
+
+ for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+ page = virt_to_head_page(ptr);
+ if (is_reserved && !PageReserved(page))
+ return "<spans Reserved and non-Reserved pages>";
+ if (is_cma && !is_migrate_cma_page(page))
+ return "<spans CMA and non-CMA pages>";
+ }
+#endif
+
+ return NULL;
+}
+
+static inline const char *check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ struct page *page;
+
+ /*
+ * Some architectures (arm64) return true for virt_addr_valid() on
+ * vmalloced addresses. Work around this by checking for vmalloc
+ * first.
+ */
+ if (is_vmalloc_addr(ptr))
+ return NULL;
+
+ if (!virt_addr_valid(ptr))
+ return NULL;
+
+ page = virt_to_head_page(ptr);
+
+ /* Check slab allocator for flags and size. */
+ if (PageSlab(page))
+ return __check_heap_object(ptr, n, page);
+
+ /* Verify object does not incorrectly span multiple pages. */
+ return check_page_span(ptr, n, page, to_user);
+}
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - known-safe heap or stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+ const char *err;
+
+ /* Skip all tests if size is zero. */
+ if (!n)
+ return;
+
+ /* Check for invalid addresses. */
+ err = check_bogus_address(ptr, n);
+ if (err)
+ goto report;
+
+ /* Check for bad heap object. */
+ err = check_heap_object(ptr, n, to_user);
+ if (err)
+ goto report;
+
+ /* Check for bad stack object. */
+ switch (check_stack_object(ptr, n)) {
+ case NOT_STACK:
+ /* Object is not touching the current process stack. */
+ break;
+ case GOOD_FRAME:
+ case GOOD_STACK:
+ /*
+ * Object is either in the correct frame (when it
+ * is possible to check) or just generally on the
+ * process stack (when frame checking not available).
+ */
+ return;
+ default:
+ err = "<process stack>";
+ goto report;
+ }
+
+ /* Check for object in kernel to avoid text exposure. */
+ err = check_kernel_text_object(ptr, n);
+ if (!err)
+ return;
+
+report:
+ report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d48b28219edf..4b2a225d92f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -186,6 +187,39 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}
+struct dentry *debug_file;
+
+static int debug_shrinker_show(struct seq_file *s, void *unused)
+{
+ struct shrinker *shrinker;
+ struct shrink_control sc;
+
+ sc.gfp_mask = -1;
+ sc.nr_to_scan = 0;
+
+ down_read(&shrinker_rwsem);
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ int num_objs;
+
+ num_objs = shrinker->count_objects(shrinker, &sc);
+ seq_printf(s, "%pf %d\n", shrinker->scan_objects, num_objs);
+ }
+ up_read(&shrinker_rwsem);
+ return 0;
+}
+
+static int debug_shrinker_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_shrinker_show, inode->i_private);
+}
+
+static const struct file_operations debug_shrinker_fops = {
+ .open = debug_shrinker_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/*
* Add a shrinker callback to be called from the vm.
*/
@@ -215,6 +249,15 @@ int register_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(register_shrinker);
+static int __init add_shrinker_debug(void)
+{
+ debugfs_create_file("shrinker", 0644, NULL, NULL,
+ &debug_shrinker_fops);
+ return 0;
+}
+
+late_initcall(add_shrinker_debug);
+
/*
* Remove one
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4590aa42b6cd..50e673be0289 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -455,7 +455,7 @@ static int fold_diff(int *diff)
*
* The function returns the number of global counters updated.
*/
-static int refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(bool do_pagesets)
{
struct zone *zone;
int i;
@@ -479,33 +479,35 @@ static int refresh_cpu_vm_stats(void)
#endif
}
}
- cond_resched();
#ifdef CONFIG_NUMA
- /*
- * Deal with draining the remote pageset of this
- * processor
- *
- * Check if there are pages remaining in this pageset
- * if not then there is nothing to expire.
- */
- if (!__this_cpu_read(p->expire) ||
+ if (do_pagesets) {
+ cond_resched();
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!__this_cpu_read(p->expire) ||
!__this_cpu_read(p->pcp.count))
- continue;
-
- /*
- * We never drain zones local to this processor.
- */
- if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
- continue;
- }
+ continue;
+
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ __this_cpu_write(p->expire, 0);
+ continue;
+ }
- if (__this_cpu_dec_return(p->expire))
- continue;
+ if (__this_cpu_dec_return(p->expire))
+ continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
- changes++;
+ if (__this_cpu_read(p->pcp.count)) {
+ drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
}
#endif
}
@@ -1259,7 +1261,7 @@ static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- if (refresh_cpu_vm_stats())
+ if (refresh_cpu_vm_stats(true)) {
/*
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
@@ -1267,7 +1269,7 @@ static void vmstat_update(struct work_struct *w)
*/
schedule_delayed_work(this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
- else {
+ } else {
/*
* We did not update any counters so the app may be in
* a mode where it does not cause counter updates.
@@ -1290,6 +1292,23 @@ static void vmstat_update(struct work_struct *w)
}
/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ cancel_delayed_work(this_cpu_ptr(&vmstat_work));
+
+ } while (refresh_cpu_vm_stats(false));
+}
+
+/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
@@ -1321,7 +1340,7 @@ static bool need_update(int cpu)
*/
static void vmstat_shepherd(struct work_struct *w);
-static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
static void vmstat_shepherd(struct work_struct *w)
{
diff --git a/mm/zbud.c b/mm/zbud.c
index ecf1dbef6983..2010b6ecf5ce 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zbud_zpool_create(char *name, gfp_t gfp,
+ struct zpool_ops *zpool_ops)
{
return zbud_create_pool(gfp, &zbud_zpool_ops);
}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_create_pool() - Create a new zpool
* @type The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name The name of the zpool (e.g. zram0, zswap)
* @gfp The GFP flags to use when allocating the pool.
* @ops The optional ops callback.
*
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
+struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+ struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops);
zpool->ops = ops;
if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..a8b5e749e84e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
*/
/*
- * This allocator is designed for use with zram. Thus, the allocator is
- * supposed to work well under low memory conditions. In particular, it
- * never attempts higher order page allocation which is very likely to
- * fail under memory pressure. On the other hand, if we just use single
- * (0-order) pages, it would suffer from very high fragmentation --
- * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
- * This was one of the major issues with its predecessor (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
* Following is how we use various fields and flags of underlying
* struct page(s) to form a zspage.
*
@@ -57,6 +28,8 @@
*
* page->private (union with page->first_page): refers to the
* component page after the first page
+ * If the page is first_page for huge object, it stores handle.
+ * Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
* Free objects are linked together using in-place
* metadata.
@@ -78,6 +51,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -91,6 +65,7 @@
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
@@ -109,6 +84,8 @@
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -132,13 +109,33 @@
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT 0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
/*
@@ -155,8 +152,6 @@
* (reason above)
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
-#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
- ZS_SIZE_CLASS_DELTA + 1)
/*
* We do not maintain any list for completely empty or full pages
@@ -170,6 +165,29 @@ enum fullness_group {
ZS_FULL
};
+enum zs_stat_type {
+ OBJ_ALLOCATED,
+ OBJ_USED,
+ CLASS_ALMOST_FULL,
+ CLASS_ALMOST_EMPTY,
+ NR_ZS_STAT_TYPE,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static struct dentry *zs_stat_root;
+
+struct zs_size_stat {
+ unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#endif
+
+/*
+ * number of size_classes
+ */
+static int zs_size_classes;
+
/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
@@ -196,6 +214,12 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct zs_size_stat stats;
+#endif
spinlock_t lock;
@@ -209,15 +233,31 @@ struct size_class {
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
- /* Handle of next free chunk (encodes <PFN, obj_idx>) */
- void *next;
+ union {
+ /*
+ * Position of next free chunk (encodes <PFN, obj_idx>)
+ * It's valid for non-allocated object
+ */
+ void *next;
+ /*
+ * Handle of allocated object.
+ */
+ unsigned long handle;
+ };
};
struct zs_pool {
- struct size_class size_class[ZS_SIZE_CLASSES];
+ char *name;
+
+ struct size_class **size_class;
+ struct kmem_cache *handle_cachep;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct dentry *stat_dentry;
+#endif
};
/*
@@ -237,15 +277,45 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
+ bool huge;
};
+static int create_handle_cache(struct zs_pool *pool)
+{
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+ if (pool->handle_cachep)
+ kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool)
+{
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ pool->flags & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+ kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+ *(unsigned long *)handle = obj;
+}
+
/* zpool driver */
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zs_create_pool(gfp);
+ return zs_create_pool(name, gfp);
}
static void zs_zpool_destroy(void *pool)
@@ -316,6 +386,11 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
+
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -366,9 +441,182 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return idx;
+ return min(zs_size_classes - 1, idx);
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
}
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long class_almost_full, class_almost_empty;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ "class", "size", "almost_full", "almost_empty",
+ "obj_allocated", "obj_used", "pages_used",
+ "pages_per_zspage");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+ class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ i, class->size, class_almost_full, class_almost_empty,
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage);
+
+ total_class_almost_full += class_almost_full;
+ total_class_almost_empty += class_almost_empty;
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ "Total", "", total_class_almost_full,
+ total_class_almost_empty, total_objs,
+ total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
+
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -389,7 +637,7 @@ static enum fullness_group get_fullness_group(struct page *page)
fg = ZS_EMPTY;
else if (inuse == max_objects)
fg = ZS_FULL;
- else if (inuse <= max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * max_objects / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -418,6 +666,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
list_add_tail(&page->lru, &(*head)->lru);
*head = page;
+ zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -443,6 +693,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
struct page, lru);
list_del_init(&page->lru);
+ zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -454,11 +706,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
* page from the freelist of the old fullness group to that of the new
* fullness group.
*/
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+static enum fullness_group fix_fullness_group(struct size_class *class,
struct page *page)
{
int class_idx;
- struct size_class *class;
enum fullness_group currfg, newfg;
BUG_ON(!is_first_page(page));
@@ -468,7 +719,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = &pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -482,7 +732,8 @@ out:
* to form a zspage for each size class. This is important
* to reduce wastage due to unusable space left at end of
* each zspage which is given as:
- * wastage = Zp - Zp % size_class
+ * wastage = Zp % class_size
+ * usage = Zp - wastage
* where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
*
* For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -541,35 +792,50 @@ static struct page *get_next_page(struct page *page)
/*
* Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
+ * We use the least bit of handle for tagging.
*/
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
- unsigned long handle;
+ unsigned long obj;
if (!page) {
BUG_ON(obj_idx);
return NULL;
}
- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
- handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+ obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj <<= OBJ_TAG_BITS;
- return (void *)handle;
+ return (void *)obj;
}
/*
* Decode <page, obj_idx> pair from the given object handle. We adjust the
* decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
+ * location_to_obj().
*/
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long obj, struct page **page,
unsigned long *obj_idx)
{
- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
- *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+ return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+ void *obj)
+{
+ if (class->huge) {
+ VM_BUG_ON(!is_first_page(page));
+ return *(unsigned long *)page_private(page);
+ } else
+ return *(unsigned long *)obj;
}
static unsigned long obj_idx_to_offset(struct page *page,
@@ -583,6 +849,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
return off + obj_idx * class_size;
}
+static inline int trypin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+ while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
static void reset_page(struct page *page)
{
clear_bit(PG_private, &page->flags);
@@ -629,6 +914,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
struct page *next_page;
struct link_free *link;
unsigned int i = 1;
+ void *vaddr;
/*
* page->index stores offset of first object starting
@@ -639,11 +925,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
if (page != first_page)
page->index = off;
- link = (struct link_free *)kmap_atomic(page) +
- off / sizeof(*link);
+ vaddr = kmap_atomic(page);
+ link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i++);
+ link->next = location_to_obj(page, i++);
link += class->size / sizeof(*link);
}
@@ -653,8 +939,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = obj_location_to_handle(next_page, 0);
- kunmap_atomic(link);
+ link->next = location_to_obj(next_page, 0);
+ kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
@@ -707,7 +993,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
init_zspage(first_page, class);
- first_page->freelist = obj_location_to_handle(first_page, 0);
+ first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
@@ -784,7 +1070,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm_buf)
return 0;
- area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+ area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
if (!area->vm_buf)
return -ENOMEM;
return 0;
@@ -792,8 +1078,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
static inline void __zs_cpu_down(struct mapping_area *area)
{
- if (area->vm_buf)
- free_page((unsigned long)area->vm_buf);
+ kfree(area->vm_buf);
area->vm_buf = NULL;
}
@@ -830,12 +1115,19 @@ static void __zs_unmap_object(struct mapping_area *area,
{
int sizes[2];
void *addr;
- char *buf = area->vm_buf;
+ char *buf;
/* no write fastpath */
if (area->vm_mm == ZS_MM_RO)
goto out;
+ buf = area->vm_buf;
+ if (!area->huge) {
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
+ }
+
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -881,13 +1173,26 @@ static struct notifier_block zs_cpu_nb = {
.notifier_call = zs_cpu_notifier
};
-static void zs_exit(void)
+static int zs_register_cpu_notifier(void)
{
- int cpu;
+ int cpu, uninitialized_var(ret);
-#ifdef CONFIG_ZPOOL
- zpool_unregister_driver(&zs_zpool_driver);
-#endif
+ cpu_notifier_register_begin();
+
+ __register_cpu_notifier(&zs_cpu_nb);
+ for_each_online_cpu(cpu) {
+ ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ if (notifier_to_errno(ret))
+ break;
+ }
+
+ cpu_notifier_register_done();
+ return notifier_to_errno(ret);
+}
+
+static void zs_unregister_cpu_notifier(void)
+{
+ int cpu;
cpu_notifier_register_begin();
@@ -898,93 +1203,176 @@ static void zs_exit(void)
cpu_notifier_register_done();
}
-static int zs_init(void)
+static void init_zs_size_classes(void)
{
- int cpu, ret;
+ int nr;
- cpu_notifier_register_begin();
+ nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
+ if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
+ nr += 1;
- __register_cpu_notifier(&zs_cpu_nb);
- for_each_online_cpu(cpu) {
- ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret)) {
- cpu_notifier_register_done();
- goto fail;
- }
- }
+ zs_size_classes = nr;
+}
- cpu_notifier_register_done();
+static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+{
+ if (prev->pages_per_zspage != pages_per_zspage)
+ return false;
-#ifdef CONFIG_ZPOOL
- zpool_register_driver(&zs_zpool_driver);
-#endif
+ if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
+ != get_maxobj_per_zspage(size, pages_per_zspage))
+ return false;
- return 0;
-fail:
- zs_exit();
- return notifier_to_errno(ret);
+ return true;
+}
+
+static bool zspage_full(struct page *page)
+{
+ BUG_ON(!is_first_page(page));
+
+ return page->inuse == page->objects;
+}
+
+unsigned long zs_get_total_pages(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->pages_allocated);
}
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
*
- * This function must be called before anything when using
- * the zsmalloc allocator.
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
*
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
*/
-struct zs_pool *zs_create_pool(gfp_t flags)
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
{
- int i, ovhd_size;
- struct zs_pool *pool;
+ struct page *page;
+ unsigned long obj, obj_idx, off;
- ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
- pool = kzalloc(ovhd_size, GFP_KERNEL);
- if (!pool)
- return NULL;
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
+ void *ret;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
- int size;
- struct size_class *class;
+ BUG_ON(!handle);
- size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
- if (size > ZS_MAX_ALLOC_SIZE)
- size = ZS_MAX_ALLOC_SIZE;
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another users mappings.
+ */
+ BUG_ON(in_interrupt());
- class = &pool->size_class[i];
- class->size = size;
- class->index = i;
- spin_lock_init(&class->lock);
- class->pages_per_zspage = get_pages_per_zspage(size);
+ /* From now on, migration cannot move the object */
+ pin_tag(handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ ret = area->vm_addr + off;
+ goto out;
}
- pool->flags = flags;
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
- return pool;
+ ret = __zs_map_object(area, pages, off, class->size);
+out:
+ if (!class->huge)
+ ret += ZS_HANDLE_SIZE;
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(zs_create_pool);
+EXPORT_SYMBOL_GPL(zs_map_object);
-void zs_destroy_pool(struct zs_pool *pool)
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
- int i;
+ struct page *page;
+ unsigned long obj, obj_idx, off;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
- int fg;
- struct size_class *class = &pool->size_class[i];
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
- for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
- if (class->fullness_list[fg]) {
- pr_info("Freeing non-empty class with size %db, fullness group %d\n",
- class->size, fg);
- }
- }
+ BUG_ON(!handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = this_cpu_ptr(&zs_map_area);
+ if (off + class->size <= PAGE_SIZE)
+ kunmap_atomic(area->vm_addr);
+ else {
+ struct page *pages[2];
+
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ __zs_unmap_object(area, pages, off, class->size);
}
- kfree(pool);
+ put_cpu_var(zs_map_area);
+ unpin_tag(handle);
}
-EXPORT_SYMBOL_GPL(zs_destroy_pool);
+EXPORT_SYMBOL_GPL(zs_unmap_object);
+
+static unsigned long obj_malloc(struct page *first_page,
+ struct size_class *class, unsigned long handle)
+{
+ unsigned long obj;
+ struct link_free *link;
+
+ struct page *m_page;
+ unsigned long m_objidx, m_offset;
+ void *vaddr;
+
+ handle |= OBJ_ALLOCATED_TAG;
+ obj = (unsigned long)first_page->freelist;
+ obj_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ if (!class->huge)
+ /* record handle in the header of allocated chunk */
+ link->handle = handle;
+ else
+ /* record handle in first_page->private */
+ set_page_private(first_page, handle);
+ kunmap_atomic(vaddr);
+ first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
+
+ return obj;
+}
+
/**
* zs_malloc - Allocate block of given size from pool.
@@ -997,20 +1385,20 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
- unsigned long obj;
- struct link_free *link;
- int class_idx;
+ unsigned long handle, obj;
struct size_class *class;
-
- struct page *first_page, *m_page;
- unsigned long m_objidx, m_offset;
+ struct page *first_page;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- class_idx = get_size_class_index(size);
- class = &pool->size_class[class_idx];
- BUG_ON(class_idx != class->index);
+ handle = alloc_handle(pool);
+ if (!handle)
+ return 0;
+
+ /* extra space in chunk to keep the handle */
+ size += ZS_HANDLE_SIZE;
+ class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
first_page = find_get_zspage(class);
@@ -1018,170 +1406,539 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
first_page = alloc_zspage(class, pool->flags);
- if (unlikely(!first_page))
+ if (unlikely(!first_page)) {
+ free_handle(pool, handle);
return 0;
+ }
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
+
spin_lock(&class->lock);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
}
- obj = (unsigned long)first_page->freelist;
- obj_handle_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
- link = (struct link_free *)kmap_atomic(m_page) +
- m_offset / sizeof(*link);
- first_page->freelist = link->next;
- memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(link);
-
- first_page->inuse++;
+ obj = obj_malloc(first_page, class, handle);
/* Now move the zspage to another fullness group, if required */
- fix_fullness_group(pool, first_page);
+ fix_fullness_group(class, first_page);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- return obj;
+ return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-void zs_free(struct zs_pool *pool, unsigned long obj)
+static void obj_free(struct zs_pool *pool, struct size_class *class,
+ unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
-
+ void *vaddr;
int class_idx;
- struct size_class *class;
enum fullness_group fullness;
- if (unlikely(!obj))
- return;
+ BUG_ON(!obj);
- obj_handle_to_location(obj, &f_page, &f_objidx);
+ obj &= ~OBJ_ALLOCATED_TAG;
+ obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = &pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
- spin_lock(&class->lock);
+ vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
- link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
- + f_offset);
+ link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
- kunmap_atomic(link);
+ if (class->huge)
+ set_page_private(first_page, 0);
+ kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
-
first_page->inuse--;
- fullness = fix_fullness_group(pool, first_page);
- spin_unlock(&class->lock);
+ zs_stat_dec(class, OBJ_USED, 1);
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *first_page, *f_page;
+ unsigned long obj, f_objidx;
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!handle))
+ return;
+ pin_tag(handle);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ obj_free(pool, class, obj);
+ fullness = fix_fullness_group(class, first_page);
if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
free_zspage(first_page);
}
+ spin_unlock(&class->lock);
+ unpin_tag(handle);
+
+ free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
+static void zs_object_copy(unsigned long src, unsigned long dst,
+ struct size_class *class)
+{
+ struct page *s_page, *d_page;
+ unsigned long s_objidx, d_objidx;
+ unsigned long s_off, d_off;
+ void *s_addr, *d_addr;
+ int s_size, d_size, size;
+ int written = 0;
+
+ s_size = d_size = class->size;
+
+ obj_to_location(src, &s_page, &s_objidx);
+ obj_to_location(dst, &d_page, &d_objidx);
+
+ s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+ d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+ if (s_off + class->size > PAGE_SIZE)
+ s_size = PAGE_SIZE - s_off;
+
+ if (d_off + class->size > PAGE_SIZE)
+ d_size = PAGE_SIZE - d_off;
+
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+
+ while (1) {
+ size = min(s_size, d_size);
+ memcpy(d_addr + d_off, s_addr + s_off, size);
+ written += size;
+
+ if (written == class->size)
+ break;
+
+ s_off += size;
+ s_size -= size;
+ d_off += size;
+ d_size -= size;
+
+ if (s_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+ s_page = get_next_page(s_page);
+ BUG_ON(!s_page);
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+ s_size = class->size - written;
+ s_off = 0;
+ }
+
+ if (d_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ d_page = get_next_page(d_page);
+ BUG_ON(!d_page);
+ d_addr = kmap_atomic(d_page);
+ d_size = class->size - written;
+ d_off = 0;
+ }
+ }
+
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct page *page, int index,
+ struct size_class *class)
+{
+ unsigned long head;
+ int offset = 0;
+ unsigned long handle = 0;
+ void *addr = kmap_atomic(page);
+
+ if (!is_first_page(page))
+ offset = page->index;
+ offset += class->size * index;
+
+ while (offset < PAGE_SIZE) {
+ head = obj_to_head(class, page, addr + offset);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (trypin_tag(handle))
+ break;
+ handle = 0;
+ }
+
+ offset += class->size;
+ index++;
+ }
+
+ kunmap_atomic(addr);
+ return handle;
+}
+
+struct zs_compact_control {
+ /* Source page for migration which could be a subpage of zspage. */
+ struct page *s_page;
+ /* Destination page for migration which should be a first page
+ * of zspage. */
+ struct page *d_page;
+ /* Starting object index within @s_page which used for live object
+ * in the subpage. */
+ int index;
+ /* how many of objects are migrated */
+ int nr_migrated;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
+{
+ unsigned long used_obj, free_obj;
+ unsigned long handle;
+ struct page *s_page = cc->s_page;
+ struct page *d_page = cc->d_page;
+ unsigned long index = cc->index;
+ int nr_migrated = 0;
+ int ret = 0;
+
+ while (1) {
+ handle = find_alloced_obj(s_page, index, class);
+ if (!handle) {
+ s_page = get_next_page(s_page);
+ if (!s_page)
+ break;
+ index = 0;
+ continue;
+ }
+
+ /* Stop if there is no more space */
+ if (zspage_full(d_page)) {
+ unpin_tag(handle);
+ ret = -ENOMEM;
+ break;
+ }
+
+ used_obj = handle_to_obj(handle);
+ free_obj = obj_malloc(d_page, class, handle);
+ zs_object_copy(used_obj, free_obj, class);
+ index++;
+ record_obj(handle, free_obj);
+ unpin_tag(handle);
+ obj_free(pool, class, used_obj);
+ nr_migrated++;
+ }
+
+ /* Remember last position in this iteration */
+ cc->s_page = s_page;
+ cc->index = index;
+ cc->nr_migrated = nr_migrated;
+
+ return ret;
+}
+
+static struct page *alloc_target_page(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page) {
+ remove_zspage(page, class, i);
+ break;
+ }
+ }
+
+ return page;
+}
+
+static void putback_zspage(struct zs_pool *pool, struct size_class *class,
+ struct page *first_page)
+{
+ enum fullness_group fullness;
+
+ BUG_ON(!is_first_page(first_page));
+
+ fullness = get_fullness_group(first_page);
+ insert_zspage(first_page, class, fullness);
+ set_zspage_mapping(first_page, class->index, fullness);
+
+ if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+
+ free_zspage(first_page);
+ }
+}
+
+static struct page *isolate_source_page(struct size_class *class)
+{
+ struct page *page;
+
+ page = class->fullness_list[ZS_ALMOST_EMPTY];
+ if (page)
+ remove_zspage(page, class, ZS_ALMOST_EMPTY);
+
+ return page;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
+{
+ int nr_to_migrate;
+ struct zs_compact_control cc;
+ struct page *src_page;
+ struct page *dst_page = NULL;
+ unsigned long nr_total_migrated = 0;
+
+ spin_lock(&class->lock);
+ while ((src_page = isolate_source_page(class))) {
+
+ BUG_ON(!is_first_page(src_page));
+
+ /* The goal is to migrate all live objects in source page */
+ nr_to_migrate = src_page->inuse;
+ cc.index = 0;
+ cc.s_page = src_page;
+
+ while ((dst_page = alloc_target_page(class))) {
+ cc.d_page = dst_page;
+ /*
+ * If there is no more space in dst_page, try to
+ * allocate another zspage.
+ */
+ if (!migrate_zspage(pool, class, &cc))
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ nr_total_migrated += cc.nr_migrated;
+ nr_to_migrate -= cc.nr_migrated;
+ }
+
+ /* Stop if we couldn't find slot */
+ if (dst_page == NULL)
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ putback_zspage(pool, class, src_page);
+ spin_unlock(&class->lock);
+ nr_total_migrated += cc.nr_migrated;
+ cond_resched();
+ spin_lock(&class->lock);
+ }
+
+ if (src_page)
+ putback_zspage(pool, class, src_page);
+
+ spin_unlock(&class->lock);
+
+ return nr_total_migrated;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+ int i;
+ unsigned long nr_migrated = 0;
+ struct size_class *class;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+ nr_migrated += __zs_compact(pool, class);
+ }
+
+ return nr_migrated;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
+
/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags used to allocate pool metadata
*
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
*
- * This function returns with preemption and page faults disabled.
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
*/
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
- struct page *page;
- unsigned long obj_idx, off;
+ int i;
+ struct zs_pool *pool;
+ struct size_class *prev_class = NULL;
- unsigned int class_idx;
- enum fullness_group fg;
- struct size_class *class;
- struct mapping_area *area;
- struct page *pages[2];
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return NULL;
- BUG_ON(!handle);
+ pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
+ GFP_KERNEL);
+ if (!pool->size_class) {
+ kfree(pool);
+ return NULL;
+ }
+
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name)
+ goto err;
+
+ if (create_handle_cache(pool))
+ goto err;
/*
- * Because we use per-cpu mapping areas shared among the
- * pools/users, we can't allow mapping in interrupt context
- * because it can corrupt another users mappings.
+ * Iterate reversly, because, size of size_class that we want to use
+ * for merging should be larger or equal to current size.
*/
- BUG_ON(in_interrupt());
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ int size;
+ int pages_per_zspage;
+ struct size_class *class;
- obj_handle_to_location(handle, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+ pages_per_zspage = get_pages_per_zspage(size);
- area = &get_cpu_var(zs_map_area);
- area->vm_mm = mm;
- if (off + class->size <= PAGE_SIZE) {
- /* this object is contained entirely within a page */
- area->vm_addr = kmap_atomic(page);
- return area->vm_addr + off;
+ /*
+ * size_class is used for normal zsmalloc operation such
+ * as alloc/free for that size. Although it is natural that we
+ * have one size_class for each size, there is a chance that we
+ * can get more memory utilization if we use one size_class for
+ * many different sizes whose size_class have same
+ * characteristics. So, we makes size_class point to
+ * previous size_class if possible.
+ */
+ if (prev_class) {
+ if (can_merge(prev_class, size, pages_per_zspage)) {
+ pool->size_class[i] = prev_class;
+ continue;
+ }
+ }
+
+ class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
+ if (!class)
+ goto err;
+
+ class->size = size;
+ class->index = i;
+ class->pages_per_zspage = pages_per_zspage;
+ if (pages_per_zspage == 1 &&
+ get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+ class->huge = true;
+ spin_lock_init(&class->lock);
+ pool->size_class[i] = class;
+
+ prev_class = class;
}
- /* this object spans two pages */
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ pool->flags = flags;
+
+ if (zs_pool_stat_create(name, pool))
+ goto err;
+
+ return pool;
- return __zs_map_object(area, pages, off, class->size);
+err:
+ zs_destroy_pool(pool);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(zs_map_object);
+EXPORT_SYMBOL_GPL(zs_create_pool);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+void zs_destroy_pool(struct zs_pool *pool)
{
- struct page *page;
- unsigned long obj_idx, off;
+ int i;
- unsigned int class_idx;
- enum fullness_group fg;
- struct size_class *class;
- struct mapping_area *area;
+ zs_pool_stat_destroy(pool);
- BUG_ON(!handle);
+ for (i = 0; i < zs_size_classes; i++) {
+ int fg;
+ struct size_class *class = pool->size_class[i];
- obj_handle_to_location(handle, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ if (!class)
+ continue;
- area = this_cpu_ptr(&zs_map_area);
- if (off + class->size <= PAGE_SIZE)
- kunmap_atomic(area->vm_addr);
- else {
- struct page *pages[2];
+ if (class->index != i)
+ continue;
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+ if (class->fullness_list[fg]) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ kfree(class);
+ }
- __zs_unmap_object(area, pages, off, class->size);
+ destroy_handle_cache(pool);
+ kfree(pool->size_class);
+ kfree(pool->name);
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+static int __init zs_init(void)
+{
+ int ret = zs_register_cpu_notifier();
+
+ if (ret)
+ goto notifier_fail;
+
+ init_zs_size_classes();
+
+#ifdef CONFIG_ZPOOL
+ zpool_register_driver(&zs_zpool_driver);
+#endif
+
+ ret = zs_stat_init();
+ if (ret) {
+ pr_err("zs stat initialization failed\n");
+ goto stat_fail;
}
- put_cpu_var(zs_map_area);
+ return 0;
+
+stat_fail:
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+notifier_fail:
+ zs_unregister_cpu_notifier();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
-unsigned long zs_get_total_pages(struct zs_pool *pool)
+static void __exit zs_exit(void)
{
- return atomic_long_read(&pool->pages_allocated);
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+ zs_unregister_cpu_notifier();
+
+ zs_stat_exit();
}
-EXPORT_SYMBOL_GPL(zs_get_total_pages);
module_init(zs_init);
module_exit(zs_exit);
diff --git a/mm/zswap.c b/mm/zswap.c
index ea064c1a09ba..e2f8c7e3c643 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -907,11 +907,12 @@ static int __init init_zswap(void)
pr_info("loading zswap\n");
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+ &zswap_zpool_ops);
if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
pr_info("%s zpool not available\n", zswap_zpool_type);
zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
&zswap_zpool_ops);
}
if (!zswap_pool) {