aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorAndy Whitcroft <apw@shadowen.org>2005-06-23 00:07:54 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-23 09:45:04 -0700
commitd41dee369bff3b9dcb6328d4d822926c28cc2594 (patch)
treea0405f3b7af3ebca21838a7d427bd75a067bf850 /mm
parentaf705362ab6018071310c5fcd436a6b457517d5f (diff)
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually become a complete replacement. A significant advantage over DISCONTIGMEM is that it's completely separated from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA and DISCONTIG are often confused. Another advantage is that sparse doesn't require each NUMA node's ranges to be contiguous. It can handle overlapping ranges between nodes with no problems, where DISCONTIGMEM currently throws away that memory. Sparsemem uses an array to provide different pfn_to_page() translations for each SECTION_SIZE area of physical memory. This is what allows the mem_map[] to be chopped up. In order to do quick pfn_to_page() operations, the section number of the page is encoded in page->flags. Part of the sparsemem infrastructure enables sharing of these bits more dynamically (at compile-time) between the page_zone() and sparsemem operations. However, on 32-bit architectures, the number of bits is quite limited, and may require growing the size of the page->flags type in certain conditions. Several things might force this to occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of memory), an increase in the physical address space, or an increase in the number of used page->flags. One thing to note is that, once sparsemem is present, the NUMA node information no longer needs to be stored in the page->flags. It might provide speed increases on certain platforms and will be stored there if there is room. But, if out of room, an alternate (theoretically slower) mechanism is used. This patch introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code. Signed-off-by: Andy Whitcroft <apw@shadowen.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Martin Bligh <mbligh@aracnet.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com> Signed-off-by: Bob Picco <bob.picco@hp.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig38
-rw-r--r--mm/Makefile1
-rw-r--r--mm/bootmem.c9
-rw-r--r--mm/memory.c2
-rw-r--r--mm/page_alloc.c39
-rw-r--r--mm/sparse.c85
6 files changed, 159 insertions, 15 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5127441561b..cd379936cac 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -6,6 +6,7 @@ choice
prompt "Memory model"
depends on SELECT_MEMORY_MODEL
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+ default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
config FLATMEM_MANUAL
@@ -17,7 +18,15 @@ config FLATMEM_MANUAL
only have one option here: FLATMEM. This is normal
and a correct option.
- If unsure, choose this option over any other.
+ Some users of more advanced features like NUMA and
+ memory hotplug may have different options here.
+ DISCONTIGMEM is an more mature, better tested system,
+ but is incompatible with memory hotplug and may suffer
+ decreased performance over SPARSEMEM. If unsure between
+ "Sparse Memory" and "Discontiguous Memory", choose
+ "Discontiguous Memory".
+
+ If unsure, choose this option (Flat Memory) over any other.
config DISCONTIGMEM_MANUAL
bool "Discontigious Memory"
@@ -35,15 +44,38 @@ config DISCONTIGMEM_MANUAL
If unsure, choose "Flat Memory" over this option.
+config SPARSEMEM_MANUAL
+ bool "Sparse Memory"
+ depends on ARCH_SPARSEMEM_ENABLE
+ help
+ This will be the only option for some systems, including
+ memory hotplug systems. This is normal.
+
+ For many other systems, this will be an alternative to
+ "Discontigious Memory". This option provides some potential
+ performance benefits, along with decreased code complexity,
+ but it is newer, and more experimental.
+
+ If unsure, choose "Discontiguous Memory" or "Flat Memory"
+ over this option.
+
endchoice
config DISCONTIGMEM
def_bool y
depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+config SPARSEMEM
+ def_bool y
+ depends on SPARSEMEM_MANUAL
+
config FLATMEM
def_bool y
- depends on !DISCONTIGMEM || FLATMEM_MANUAL
+ depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+ def_bool y
+ depends on !SPARSEMEM
#
# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
@@ -56,4 +88,4 @@ config NEED_MULTIPLE_NODES
config HAVE_MEMORY_PRESENT
def_bool y
- depends on ARCH_HAVE_MEMORY_PRESENT
+ depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6..8f70ffd763c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d..f82f7aebbee 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -256,6 +256,7 @@ found:
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
struct page *page;
+ unsigned long pfn;
bootmem_data_t *bdata = pgdat->bdata;
unsigned long i, count, total = 0;
unsigned long idx;
@@ -266,7 +267,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
count = 0;
/* first extant page of the node */
- page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+ pfn = bdata->node_boot_start >> PAGE_SHIFT;
idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
map = bdata->node_bootmem_map;
/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +276,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
gofast = 1;
for (i = 0; i < idx; ) {
unsigned long v = ~map[i / BITS_PER_LONG];
+
if (gofast && v == ~0UL) {
int j, order;
+ page = pfn_to_page(pfn);
count += BITS_PER_LONG;
__ClearPageReserved(page);
order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +295,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
page += BITS_PER_LONG;
} else if (v) {
unsigned long m;
+
+ page = pfn_to_page(pfn);
for (m = 1; m && i < idx; m<<=1, page++, i++) {
if (v & m) {
count++;
@@ -302,8 +307,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
}
} else {
i+=BITS_PER_LONG;
- page += BITS_PER_LONG;
}
+ pfn += BITS_PER_LONG;
}
total += count;
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf998..30975ef4872 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
#include <linux/swapops.h>
#include <linux/elf.h>
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20e239599db..5c1b8982a6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
*/
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -1649,11 +1649,15 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
- struct page *start = pfn_to_page(start_pfn);
struct page *page;
+ int end_pfn = start_pfn + size;
+ int pfn;
- for (page = start; page < (start + size); page++) {
- set_page_links(page, zone, nid);
+ for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ set_page_links(page, zone, nid, pfn);
set_page_count(page, 0);
reset_page_mapcount(page);
SetPageReserved(page);
@@ -1677,6 +1681,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
}
}
+#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long snum = pfn_to_section_nr(pfn);
+ unsigned long end = pfn_to_section_nr(pfn + size);
+
+ if (FLAGS_HAS_NODE)
+ zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+ else
+ for (; snum <= end; snum++)
+ zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1861,7 +1879,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
unsigned long size, realsize;
unsigned long batch;
- zone_table[NODEZONE(nid, j)] = zone;
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1927,6 +1944,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
memmap_init(size, nid, j, zone_start_pfn);
+ zonetable_add(zone, nid, j, zone_start_pfn, size);
+
zone_start_pfn += size;
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1935,28 +1954,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
- unsigned long size;
- struct page *map;
-
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
+ unsigned long size;
+ struct page *map;
+
size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
map = alloc_bootmem_node(pgdat, size);
pgdat->node_mem_map = map;
}
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0))
mem_map = NODE_DATA(0)->node_mem_map;
#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 00000000000..f888385b9e1
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,85 @@
+/*
+ * sparse memory mappings.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section - memory sections, mem_map's for valid memory
+ */
+struct mem_section mem_section[NR_MEM_SECTIONS];
+EXPORT_SYMBOL(mem_section);
+
+/* Record a memory area against a node. */
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+
+ start &= PAGE_SECTION_MASK;
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ unsigned long section = pfn_to_section_nr(pfn);
+ if (!mem_section[section].section_mem_map)
+ mem_section[section].section_mem_map = (void *) -1;
+ }
+}
+
+/*
+ * Only used by the i386 NUMA architecures, but relatively
+ * generic code.
+ */
+unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+ unsigned long nr_pages = 0;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ if (nid != early_pfn_to_nid(pfn))
+ continue;
+
+ if (pfn_valid(pfn))
+ nr_pages += PAGES_PER_SECTION;
+ }
+
+ return nr_pages * sizeof(struct page);
+}
+
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void sparse_init(void)
+{
+ unsigned long pnum;
+ struct page *map;
+ int nid;
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ if (!mem_section[pnum].section_mem_map)
+ continue;
+
+ nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+ map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+ if (!map)
+ map = alloc_bootmem_node(NODE_DATA(nid),
+ sizeof(struct page) * PAGES_PER_SECTION);
+ if (!map) {
+ mem_section[pnum].section_mem_map = 0;
+ continue;
+ }
+
+ /*
+ * Subtle, we encode the real pfn into the mem_map such that
+ * the identity pfn - section_mem_map will return the actual
+ * physical page frame number.
+ */
+ mem_section[pnum].section_mem_map = map -
+ section_nr_to_pfn(pnum);
+ }
+}