/* * Xen leaves the responsibility for maintaining p2m mappings to the * guests themselves, but it must also access and update the p2m array * during suspend/resume when all the pages are reallocated. * * The p2m table is logically a flat array, but we implement it as a * three-level tree to allow the address space to be sparse. * * Xen * | * p2m_top p2m_top_mfn * / \ / \ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn * / \ / \ / / * p2m p2m p2m p2m p2m p2m p2m ... * * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. * * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the * maximum representable pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages * * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to * 512 and 1024 entries respectively. */ #include #include #include #include #include #include #include #include #include #include #include "xen-ops.h" static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); } static inline unsigned p2m_mid_index(unsigned long pfn) { return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { return pfn % P2M_PER_PAGE; } static void p2m_top_init(unsigned long ***top) { unsigned i; for (i = 0; i < P2M_TOP_PER_PAGE; i++) top[i] = p2m_mid_missing; } static void p2m_top_mfn_init(unsigned long *top) { unsigned i; for (i = 0; i < P2M_TOP_PER_PAGE; i++) top[i] = virt_to_mfn(p2m_mid_missing_mfn); } static void p2m_top_mfn_p_init(unsigned long **top) { unsigned i; for (i = 0; i < P2M_TOP_PER_PAGE; i++) top[i] = p2m_mid_missing_mfn; } static void p2m_mid_init(unsigned long **mid) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) mid[i] = p2m_missing; } static void p2m_mid_mfn_init(unsigned long *mid) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) mid[i] = virt_to_mfn(p2m_missing); } static void p2m_init(unsigned long *p2m) { unsigned i; for (i = 0; i < P2M_MID_PER_PAGE; i++) p2m[i] = INVALID_P2M_ENTRY; } /* * Build the parallel p2m_top_mfn and p2m_mid_mfn structures * * This is called both at boot time, and after resuming from suspend: * - At boot time we're called very early, and must use extend_brk() * to allocate memory. * * - After resume we're called from within stop_machine, but the mfn * tree should alreay be completely allocated. */ void xen_build_mfn_list_list(void) { unsigned long pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn); p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; unsigned long *mid_mfn_p; mid = p2m_top[topidx]; mid_mfn_p = p2m_top_mfn_p[topidx]; /* Don't bother allocating any mfn mid levels if * they're just missing, just update the stored mfn, * since all could have changed over a migrate. */ if (mid == p2m_mid_missing) { BUG_ON(mididx); BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; continue; } if (mid_mfn_p == p2m_mid_missing_mfn) { /* * XXX boot-time only! We should never find * missing parts of the mfn tree after * runtime. extend_brk() will BUG if we call * it too late. */ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p); p2m_top_mfn_p[topidx] = mid_mfn_p; } p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } void xen_setup_mfn_list_list(void) { BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned long pfn; xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(p2m_mid_missing); p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just * need to graft that into our tree structure. */ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); if (p2m_top[topidx] == p2m_mid_missing) { unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid); p2m_top[topidx] = mid; } /* * As long as the mfn_list has enough entries to completely * fill a p2m page, pointing into the array is ok. But if * not the entries beyond the last pfn will be undefined. * And guessing that the 'what-ever-there-is' does not take it * too kindly when changing it to invalid markers, a new page * is allocated, initialized and filled with the valid part. */ if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { unsigned long p2midx; unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m); for (p2midx = 0; pfn + p2midx < max_pfn; p2midx++) { p2m[p2midx] = mfn_list[pfn + p2midx]; } p2m_top[topidx][mididx] = p2m; } else p2m_top[topidx][mididx] = &mfn_list[pfn]; } m2p_override_init(); } unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, mididx, idx; if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); static void *alloc_p2m_page(void) { return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); } static void free_p2m_page(void *p) { free_page((unsigned long)p); } /* * Fully allocate the p2m structure for a given pfn. We need to check * that both the top and mid levels are allocated, and make sure the * parallel mfn tree is kept in sync. We may race with other cpus, so * the new pages are installed with cmpxchg; if we lose the race then * simply free the page we allocated and use the one that's there. */ static bool alloc_p2m(unsigned long pfn) { unsigned topidx, mididx; unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); top_p = &p2m_top[topidx]; mid = *top_p; if (mid == p2m_mid_missing) { /* Mid level is missing, allocate a new one */ mid = alloc_p2m_page(); if (!mid) return false; p2m_mid_init(mid); if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) free_p2m_page(mid); } top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = p2m_top_mfn_p[topidx]; BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); if (mid_mfn == p2m_mid_missing_mfn) { /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) return false; p2m_mid_mfn_init(mid_mfn); missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) free_p2m_page(mid_mfn); else p2m_top_mfn_p[topidx] = mid_mfn; } if (p2m_top[topidx][mididx] == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; p2m = alloc_p2m_page(); if (!p2m) return false; p2m_init(p2m); if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) free_p2m_page(p2m); else mid_mfn[mididx] = virt_to_mfn(p2m); } return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, mididx, idx; if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); if (p2m_top[topidx][mididx] == p2m_missing) return mfn == INVALID_P2M_ENTRY; p2m_top[topidx][mididx][idx] = mfn; return true; } bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; if (!__set_phys_to_machine(pfn, mfn)) return false; } return true; } #define M2P_OVERRIDE_HASH_SHIFT 10 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); static DEFINE_SPINLOCK(m2p_override_lock); static void __init m2p_override_init(void) { unsigned i; m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, sizeof(unsigned long)); for (i = 0; i < M2P_OVERRIDE_HASH; i++) INIT_LIST_HEAD(&m2p_overrides[i]); } static unsigned long mfn_hash(unsigned long mfn) { return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); } /* Add an MFN override for a particular page */ int m2p_add_override(unsigned long mfn, struct page *page) { unsigned long flags; unsigned long pfn; unsigned long address; unsigned level; pte_t *ptep = NULL; pfn = page_to_pfn(page); if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); if (WARN(ptep == NULL || level != PG_LEVEL_4K, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; } page->private = mfn; page->index = pfn_to_mfn(pfn); __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); if (!PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); return 0; } int m2p_remove_override(struct page *page) { unsigned long flags; unsigned long mfn; unsigned long pfn; unsigned long address; unsigned level; pte_t *ptep = NULL; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) return -EINVAL; if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); if (WARN(ptep == NULL || level != PG_LEVEL_4K, "m2p_remove_override: pfn %lx not mapped", pfn)) return -EINVAL; } spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); __set_phys_to_machine(pfn, page->index); if (!PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); /* No tlb flush necessary because the caller already * left the pte unmapped. */ return 0; } struct page *m2p_find_override(unsigned long mfn) { unsigned long flags; struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; struct page *p, *ret; ret = NULL; spin_lock_irqsave(&m2p_override_lock, flags); list_for_each_entry(p, bucket, lru) { if (p->private == mfn) { ret = p; break; } } spin_unlock_irqrestore(&m2p_override_lock, flags); return ret; } unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) { struct page *p = m2p_find_override(mfn); unsigned long ret = pfn; if (p) ret = page_to_pfn(p); return ret; } EXPORT_SYMBOL_GPL(m2p_find_override_pfn);