From eba3ff8b99863bcc9e66b8d528e4750229e29693 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:49 -0800 Subject: xen: add xen_set_domain_pte() Add xen_set_domain_pte() to allow setting a pte mapping a page from another domain. The common case is to map from DOMID_IO, the pseudo domain which owns all IO pages, but will also be used in the privcmd interface to map other domain pages. [ Impact: new Xen-internal API for cross-domain mappings ] Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406a..1ceb0f2fa0a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -395,7 +395,7 @@ static bool xen_iomap_pte(pte_t pte) return pte_flags(pte) & _PAGE_IOMAP; } -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; struct mmu_update *u; @@ -407,10 +407,16 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) u->ptr = arbitrary_virt_to_machine(ptep).maddr; u->val = pte_val_ma(pteval); - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); xen_mc_issue(PARAVIRT_LAZY_MMU); } +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + xen_set_domain_pte(ptep, pteval, DOMID_IO); +} static void xen_extend_mmu_update(const struct mmu_update *update) { -- cgit v1.2.3 From de1ef2065c4675ab1062ebc8d1cb6c5f42b61d04 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 21 May 2009 10:09:46 +0100 Subject: xen/privcmd: move remap_domain_mfn_range() to core xen code and export. This allows xenfs to be built as a module, previously it required flush_tlb_all and arbitrary_virt_to_machine to be exported. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 1ceb0f2fa0a..f08ea045620 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2265,6 +2265,72 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#define REMAP_BATCH_SIZE 16 + +struct remap_data { + unsigned long mfn; + pgprot_t prot; + struct mmu_update *mmu_update; +}; + +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + + rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; + + return 0; +} + +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long mfn, int nr, + pgprot_t prot, unsigned domid) +{ + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range; + int err = 0; + + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + rmd.mfn = mfn; + rmd.prot = prot; + + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err) + goto out; + + err = -EFAULT; + if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + goto out; + + nr -= batch; + addr += range; + } + + err = 0; +out: + + flush_tlb_all(); + + return err; +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; -- cgit v1.2.3 From a171ce6e7b4d967b9f9b8ba7c076a8a6d26e432b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 15:04:48 -0700 Subject: xen: dynamically allocate p2m space Use early brk mechanism to allocate p2m tables, to save memory when booting non-Xen. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406a..ecbdcf0d45d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -174,18 +174,16 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = - { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = - { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] - __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { @@ -209,7 +207,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,6 +228,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; + unsigned i; + + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, + PAGE_SIZE); + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + p2m_missing[i] = ~0UL; + + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + PAGE_SIZE); + for (i = 0; i < TOP_ENTRIES; i++) + p2m_top[i] = p2m_missing; + + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); -- cgit v1.2.3 From a2e875298729540300a9a0324ee66e3b7883a912 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:08:31 -0700 Subject: xen: allocate p2m size based on actual max size Allocate p2m tables based on the actual runtime maximum pfn rather than the static config-time limit. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ecbdcf0d45d..151813d9755 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -169,25 +169,27 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) +#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) +#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) /* Placeholder for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); + (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_DOMAIN_PAGES); + BUG_ON(pfn >= max_p2m_pfn); return pfn / P2M_ENTRIES_PER_PAGE; } @@ -201,13 +203,15 @@ void xen_build_mfn_list_list(void) { unsigned pfn, idx; - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { + for (idx = 0; + idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; + idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,19 +234,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned pfn; unsigned i; + max_p2m_pfn = max_pfn; + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, PAGE_SIZE); for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p2m_missing[i] = ~0UL; - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES; i++) + for (i = 0; i < TOP_ENTRIES(max_pfn); i++) p2m_top[i] = p2m_missing; - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), + PAGE_SIZE); p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { @@ -258,7 +265,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= max_p2m_pfn)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); @@ -304,7 +311,7 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(pfn >= max_p2m_pfn)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } -- cgit v1.2.3 From f0991802bb4368e33848e7f823caa487d23555fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:16:28 -0700 Subject: xen: use early_brk for level2_kernel_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 151813d9755..71c6af6c89a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1843,13 +1843,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); -- cgit v1.2.3 From 764f0138b9f54aa96761810055a74fce1e58c300 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:23:51 -0700 Subject: xen: allocate level1_ident_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 71c6af6c89a..3de42d1e475 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -138,7 +138,8 @@ static inline void check_zero(void) * large enough to allocate page table pages to allocate the rest. * Each page can map 2MB. */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ @@ -1718,6 +1719,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) unsigned ident_pte; unsigned long pfn; + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + ident_pte = 0; pfn = 0; for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { @@ -1728,7 +1732,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + if (ident_pte == LEVEL1_IDENT_ENTRIES) break; pte_page = &level1_ident_pgt[ident_pte]; -- cgit v1.2.3 From b7eb4ad39134ee5b09634a710e50c2990f533231 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:06:58 -0700 Subject: xen: set shared_info->arch.max_pfn to max_p2m_pfn Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3de42d1e475..909ad637c38 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ -- cgit v1.2.3 From 1f2d9dd309feb08fdbc711fa03841650dfff87d8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:11:35 -0700 Subject: xen: set the actual extent of the mfn_list_list Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 909ad637c38..fcff8c82979 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ -- cgit v1.2.3 From bbbf61eff92c7c236f57ee1953ad84055443717e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:12:17 -0700 Subject: xen: make install_p2mtop_page() static Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fcff8c82979..00969099b05 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -275,8 +275,8 @@ unsigned long get_phys_to_machine(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +/* install a new p2m_top page */ +static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) { unsigned topidx = p2m_top_index(pfn); unsigned long **pfnp, *mfnp; -- cgit v1.2.3 From 58e05027b530ff081ecea68e38de8d59db8f87e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:28:48 -0700 Subject: xen: convert p2m to a 3 level tree Make the p2m structure a 3 level tree which covers the full possible physical space. The p2m structure contains mappings from the domain's pfns to system-wide mfns. The structure has 3 levels and two roots. The first root is for the domain's own use, and is linked with virtual addresses. The second is all mfn references, and is used by Xen on save/restore to allow it to update the p2m mapping for the domain. At boot, the domain builder provides a simple flat p2m array for all the initially present pages. We construct the two levels above that using the early_brk allocator. After early boot time, set_phys_to_machine() will allocate any missing levels using the normal kernel allocator (at GFP_KERNEL, so it must be called in a normal blocking context). Because the early_brk() API requires us to pre-reserve the maximum amount of memory we could allocate, there is still a CONFIG_XEN_MAX_DOMAIN_MEMORY config option, but its only negative side-effect is to increase the kernel's apparent bss size. However, since all unused brk memory is returned to the heap, there's no real downside to making it large. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 318 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 242 insertions(+), 76 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 00969099b05..d4c7265cf0a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -170,51 +170,162 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) -#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) +static unsigned long max_p2m_pfn __read_mostly; -/* Placeholder for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) -/* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= max_p2m_pfn); - return pfn / P2M_ENTRIES_PER_PAGE; + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { - return pfn % P2M_ENTRIES_PER_PAGE; + return pfn % P2M_PER_PAGE; } -/* Build the parallel p2m_top_mfn structures */ +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ void xen_build_mfn_list_list(void) { - unsigned pfn, idx; + unsigned pfn, i; - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); } - for (idx = 0; - idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; - idx++) { - unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; - p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long mid_mfn; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + + /* Don't bother allocating any mfn mid levels if + they're just missing */ + if (mid[mididx] == p2m_missing) + continue; + + mid_mfn = p2m_top_mfn[topidx]; + mid_mfn_p = mfn_to_virt(mid_mfn); + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + mid_mfn = virt_to_mfn(mid_mfn_p); + + p2m_top_mfn[topidx] = mid_mfn; + } + + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -223,7 +334,7 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn_list); + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } @@ -233,99 +344,154 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - unsigned i; max_p2m_pfn = max_pfn; - p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, - PAGE_SIZE); - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p2m_missing[i] = ~0UL; + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES(max_pfn); i++) - p2m_top[i] = p2m_missing; + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), - PAGE_SIZE); + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); - for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. + */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); - p2m_top[topidx] = &mfn_list[pfn]; + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; } + /* Allocate and initialize top and mid mfn levels */ xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= max_p2m_pfn)) + if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - return p2m_top[topidx][idx]; + + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +static void *alloc_p2m_page(void) { - unsigned topidx = p2m_top_index(pfn); - unsigned long **pfnp, *mfnp; - unsigned i; + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} - pfnp = &p2m_top[topidx]; - mfnp = &p2m_top_mfn[topidx]; +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p[i] = INVALID_P2M_ENTRY; +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; - if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { - *mfnp = virt_to_mfn(p); - return true; + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); } - return false; -} + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = mfn_to_virt(*top_mfn_p); -static void alloc_p2m(unsigned long pfn) -{ - unsigned long *p; + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + } - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; - if (!install_p2mtop_page(pfn, p)) - free_page((unsigned long)p); + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= max_p2m_pfn)) { + if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - if (mfn == INVALID_P2M_ENTRY) - return true; - return false; - } - + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; return true; } @@ -338,7 +504,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - alloc_p2m(pfn); + WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); if (!__set_phys_to_machine(pfn, mfn)) BUG(); -- cgit v1.2.3 From c3798062f100c3e1d4ae1241bc536f3b1f28a6ca Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:42:04 -0700 Subject: xen: add return value to set_phys_to_machine() set_phys_to_machine() can return false on failure, which means a memory allocation failure for the p2m structure. It can only fail if setting the mfn for a pfn in previously unused address space. It is guaranteed to succeed if you're setting a mapping to INVALID_P2M_ENTRY or updating the mfn for an existing pfn. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d4c7265cf0a..b9651343723 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -282,7 +282,7 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn, i; + unsigned pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { @@ -496,19 +496,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } -void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; + return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); + if (!alloc_p2m(pfn)) + return false; if (!__set_phys_to_machine(pfn, mfn)) - BUG(); + return false; } + + return true; } unsigned long arbitrary_virt_to_mfn(void *vaddr) -- cgit v1.2.3 From 33a847502b0338351cebd8fc0c68ac796cfadbbd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 15:18:19 -0700 Subject: xen: defer building p2m mfn structures until kernel is mapped When building mfn parts of p2m structure, we rely on being able to use mfn_to_virt, which in turn requires kernel to be mapped into the linear area (which is distinct from the kernel image mapping on 64-bit). Defer calling xen_build_mfn_list_list() until after xen_setup_kernel_pagetable(); Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b9651343723..9b43bb398d3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -374,9 +374,6 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx][mididx] = &mfn_list[pfn]; } - - /* Allocate and initialize top and mid mfn levels */ - xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) -- cgit v1.2.3 From cfd8951e082a589637f9de3c33efd3218fdb3c03 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 31 Aug 2010 14:06:22 -0700 Subject: xen: don't map missing memory When setting up a pte for a missing pfn (no matching mfn), just create an empty pte rather than a junk mapping. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9b43bb398d3..4c63b7f452d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -745,7 +745,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; -- cgit v1.2.3 From 2f7acb208523a3bf5f1830f01c29f7feda045169 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 15 Sep 2010 13:32:49 -0700 Subject: xen: make sure xen_max_p2m_pfn is up to date Keep xen_max_p2m_pfn up to date with the end of the extra memory we're adding. It is possible that it will be too high since memory may be truncated by a "mem=" option on the kernel command line, but that won't matter. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4c63b7f452d..b2371671b11 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -195,7 +195,7 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * 512 and 1024 entries respectively. */ -static unsigned long max_p2m_pfn __read_mostly; +unsigned long xen_max_p2m_pfn __read_mostly; #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) @@ -293,7 +293,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; @@ -335,7 +335,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -345,7 +345,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - max_p2m_pfn = max_pfn; + xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); -- cgit v1.2.3 From 41f2e4771a4f1ba26c35438daf32917b9ef7858d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 30 Mar 2010 11:47:40 -0700 Subject: xen: add support for PAT Convert Linux PAT entries into Xen ones when constructing ptes. Linux doesn't use _PAGE_PAT for ptes, so the only difference in the first 4 entries is that Linux uses _PAGE_PWT for WC, whereas Xen (and default) use it for WT. xen_pte_val does the inverse conversion. We hard-code assumptions about Linux's current PAT layout, but a warning on the wrmsr to MSR_IA32_CR_PAT should point out any problems. If necessary we could go to a more general table-based conversion between Linux and Xen PAT entries. hugetlbfs poses a problem at the moment, the x86 architecture uses the same flag for _PAGE_PAT and _PAGE_PSE, which changes meaning depending on which pagetable level we're using. At the moment this should be OK so long as nobody tries to do a pte_val on a hugetlbfs pte. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b2371671b11..67b41017f7b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -780,10 +781,18 @@ static pteval_t iomap_pte(pteval_t val) pteval_t xen_pte_val(pte_t pte) { - if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) - return pte.pte; + pteval_t pteval = pte.pte; - return pte_mfn_to_pfn(pte.pte); + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } + + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -793,10 +802,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. + * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- UC UC- + * 7 PAT PCD PWT UC UC UC + */ + +void xen_set_pat(u64 pat) +{ + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} + pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... + * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.). + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } + /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA -- cgit v1.2.3 From 375b2a9ada6d105483aab22f1af1d727bc3c418d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 21 Oct 2010 11:00:46 +0100 Subject: xen: correctly rebuild mfn list list after migration. Otherwise the second migration attempt fails because the mfn_list_list still refers to all the old mfns. We need to update the entires in both p2m_top_mfn and the mid_mfn pages which p2m_top_mfn refers to. In order to do this we need to keep track of the virtual addresses mapping the p2m_mid_mfn pages since we cannot rely on mfn_to_virt(p2m_top_mfn[idx]) since p2m_top_mfn[idx] will still contain the old MFN after a migration, which may now belong to another domain and hence have a different mapping in the m2p. Therefore add and maintain a third top level page, p2m_top_mfn_p[], which tracks the virtual addresses of the mfns contained in p2m_top_mfn[]. We also need to update the content of the p2m_mid_missing_mfn page on resume to refer to the page's new mfn. p2m_missing does not need updating since the migration process takes care of the leaf p2m pages for us. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67b41017f7b..e41683cf290 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -187,6 +187,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * / \ / \ / / * p2m p2m p2m p2m p2m p2m p2m ... * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the * maximum representable pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages @@ -211,6 +213,7 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); @@ -247,6 +250,14 @@ static void p2m_top_mfn_init(unsigned long *top) top[i] = virt_to_mfn(p2m_mid_missing_mfn); } +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + static void p2m_mid_init(unsigned long **mid) { unsigned i; @@ -283,33 +294,43 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn; + unsigned long pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; - unsigned long mid_mfn; unsigned long *mid_mfn_p; mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; /* Don't bother allocating any mfn mid levels if - they're just missing */ - if (mid[mididx] == p2m_missing) + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; continue; - - mid_mfn = p2m_top_mfn[topidx]; - mid_mfn_p = mfn_to_virt(mid_mfn); + } if (mid_mfn_p == p2m_mid_missing_mfn) { /* @@ -321,11 +342,10 @@ void xen_build_mfn_list_list(void) mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p); - mid_mfn = virt_to_mfn(mid_mfn_p); - - p2m_top_mfn[topidx] = mid_mfn; + p2m_top_mfn_p[topidx] = mid_mfn_p; } + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -344,7 +364,7 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned pfn; + unsigned long pfn; xen_max_p2m_pfn = max_pfn; @@ -434,7 +454,9 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = mfn_to_virt(*top_mfn_p); + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); if (mid_mfn == p2m_mid_missing_mfn) { /* Separately check the mid mfn level */ @@ -446,11 +468,13 @@ static bool alloc_p2m(unsigned long pfn) return false; p2m_mid_mfn_init(mid_mfn); - + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; } if (p2m_top[topidx][mididx] == p2m_missing) { -- cgit v1.2.3