Diffstat (limited to 'mm/hmm.c')
 -rw-r--r--  mm/hmm.c  146
 1 file changed, 98 insertions(+), 48 deletions(-)
diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49f7a0c..90c34f3d1243 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * Authors: JÃ©rÃ´me Glisse <jglisse@redhat.com>
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
*/
/*
* Refer to include/linux/hmm.h for information about heterogeneous memory
@@ -43,7 +43,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
*
* @mm: mm struct this HMM struct is bound to
* @lock: lock protecting ranges list
- * @sequence: we track updates to the CPU page table with a sequence number
* @ranges: list of range being snapshotted
* @mirrors: list of mirrors for this mm
* @mmu_notifier: mmu notifier to track updates to CPU page table
@@ -52,7 +51,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
struct hmm {
struct mm_struct *mm;
spinlock_t lock;
- atomic_t sequence;
struct list_head ranges;
struct list_head mirrors;
struct mmu_notifier mmu_notifier;
@@ -85,22 +83,11 @@ static struct hmm *hmm_register(struct mm_struct *mm)
return NULL;
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
- atomic_set(&hmm->sequence, 0);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
spin_lock_init(&hmm->lock);
hmm->mm = mm;
- /*
- * We should only get here if hold the mmap_sem in write mode ie on
- * registration of first mirror through hmm_mirror_register()
- */
- hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
- if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
- kfree(hmm);
- return NULL;
- }
-
spin_lock(&mm->page_table_lock);
if (!mm->hmm)
mm->hmm = hmm;
@@ -108,12 +95,27 @@ static struct hmm *hmm_register(struct mm_struct *mm)
cleanup = true;
spin_unlock(&mm->page_table_lock);
- if (cleanup) {
- mmu_notifier_unregister(&hmm->mmu_notifier, mm);
- kfree(hmm);
- }
+ if (cleanup)
+ goto error;
+
+ /*
+ * We should only get here if we hold the mmap_sem in write mode, i.e. on
+ * registration of the first mirror through hmm_mirror_register().
+ */
+ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+ if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
+ goto error_mm;
return mm->hmm;
+
+error_mm:
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
+error:
+ kfree(hmm);
+ return NULL;
}
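
A sketch of the calling convention the comment above depends on: __mmu_notifier_register() must be called with mmap_sem held for write, so a driver's first registration is expected to look roughly like this (hypothetical driver code, not part of this patch):

int my_driver_mirror_mm(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = hmm_mirror_register(mirror, mm);	/* reaches hmm_register() */
	up_write(&mm->mmap_sem);
	return ret;
}
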
void hmm_mm_destroy(struct mm_struct *mm)
@@ -121,10 +123,8 @@ void hmm_mm_destroy(struct mm_struct *mm)
kfree(mm->hmm);
}
-static void hmm_invalidate_range(struct hmm *hmm,
- enum hmm_update_type action,
- unsigned long start,
- unsigned long end)
+static int hmm_invalidate_range(struct hmm *hmm, bool device,
+ const struct hmm_update *update)
{
struct hmm_mirror *mirror;
struct hmm_range *range;
@@ -133,22 +133,33 @@ static void hmm_invalidate_range(struct hmm *hmm,
list_for_each_entry(range, &hmm->ranges, list) {
unsigned long addr, idx, npages;
- if (end < range->start || start >= range->end)
+ if (update->end < range->start || update->start >= range->end)
continue;
range->valid = false;
- addr = max(start, range->start);
+ addr = max(update->start, range->start);
idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
+ npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
}
spin_unlock(&hmm->lock);
+ if (!device)
+ return 0;
+
down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list)
- mirror->ops->sync_cpu_device_pagetables(mirror, action,
- start, end);
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int ret;
+
+ ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
+ if (!update->blockable && ret == -EAGAIN) {
+ up_read(&hmm->mirrors_sem);
+ return -EAGAIN;
+ }
+ }
up_read(&hmm->mirrors_sem);
+
+ return 0;
}
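
For illustration, a minimal sketch of a driver-side mirror callback honoring the contract the loop above relies on: returning -EAGAIN is only legitimate when update->blockable is false, in which case hmm_invalidate_range() unwinds and reports -EAGAIN to the notifier. The my_* names, the pt_lock mutex and my_device_unmap() are hypothetical, not part of this patch.

struct my_device {
	struct hmm_mirror mirror;
	struct mutex pt_lock;		/* protects the device page table */
};

static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					 const struct hmm_update *update)
{
	struct my_device *mdev = container_of(mirror, struct my_device,
					      mirror);

	/* When the caller cannot block, we must not sleep on the lock. */
	if (update->blockable)
		mutex_lock(&mdev->pt_lock);
	else if (!mutex_trylock(&mdev->pt_lock))
		return -EAGAIN;

	/* Tear down device mappings covering [start, end). */
	my_device_unmap(mdev, update->start, update->end);
	mutex_unlock(&mdev->pt_lock);
	return 0;
}
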
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
@@ -178,18 +189,21 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ bool blockable)
{
+ struct hmm_update update;
struct hmm *hmm = mm->hmm;
VM_BUG_ON(!hmm);
- atomic_inc(&hmm->sequence);
-
- return 0;
+ update.start = start;
+ update.end = end;
+ update.event = HMM_UPDATE_INVALIDATE;
+ update.blockable = blockable;
+ return hmm_invalidate_range(hmm, true, &update);
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@ -197,11 +211,16 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
unsigned long start,
unsigned long end)
{
+ struct hmm_update update;
struct hmm *hmm = mm->hmm;
VM_BUG_ON(!hmm);
- hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+ update.start = start;
+ update.end = end;
+ update.event = HMM_UPDATE_INVALIDATE;
+ update.blockable = true;
+ hmm_invalidate_range(hmm, false, &update);
}
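
For reference, the layout of struct hmm_update implied by the fields filled in by the two callbacks above; the authoritative definition lives in include/linux/hmm.h and is not shown in this diff, and the enum type name is inferred from HMM_UPDATE_INVALIDATE:

struct hmm_update {
	unsigned long start;
	unsigned long end;
	enum hmm_update_event event;	/* only HMM_UPDATE_INVALIDATE here */
	bool blockable;
};
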
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
@@ -278,12 +297,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
if (!should_unregister || mm == NULL)
return;
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+
spin_lock(&mm->page_table_lock);
if (mm->hmm == hmm)
mm->hmm = NULL;
spin_unlock(&mm->page_table_lock);
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
@@ -571,22 +591,42 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
+ pmd_t pmd;
- i = (addr - range->start) >> PAGE_SHIFT;
again:
- if (pmd_none(*pmdp))
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
+ if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
return hmm_pfns_bad(start, end, walk);
- if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
- pmd_t pmd;
+ if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
+ bool fault, write_fault;
+ unsigned long npages;
+ uint64_t *pfns;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ 0, &fault, &write_fault);
+ if (fault || write_fault) {
+ hmm_vma_walk->last = addr;
+ pmd_migration_entry_wait(vma->vm_mm, pmdp);
+ return -EAGAIN;
+ }
+ return 0;
+ } else if (!pmd_present(pmd))
+ return hmm_pfns_bad(start, end, walk);
+
+ if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
* No need to take pmd_lock here: even if some other thread
* is splitting the huge pmd we will get that event through
@@ -601,13 +641,21 @@ again:
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
+ i = (addr - range->start) >> PAGE_SHIFT;
return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
}
- if (pmd_bad(*pmdp))
+ /*
+ * We have handled all the valid cases above, i.e. none, migration,
+ * huge or transparent huge. At this point the pmd is either a valid
+ * entry pointing to a pte directory or a bad pmd that will not
+ * recover.
+ */
+ if (pmd_bad(pmd))
return hmm_pfns_bad(start, end, walk);
ptep = pte_offset_map(pmdp, addr);
+ i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
int r;
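
A minimal sketch of the snapshot-then-classify pattern the rewritten walker follows: every test runs against the single READ_ONCE() copy, so a concurrent huge pmd split cannot change the entry between checks. The helper below is hypothetical and only summarizes the dispatch order; it is not part of this patch:

static bool my_pmd_is_pte_table(pmd_t pmd)
{
	if (pmd_none(pmd) || !pmd_present(pmd))
		return false;	/* hole, migration or other swap entry */
	if (pmd_devmap(pmd) || pmd_trans_huge(pmd))
		return false;	/* handled as a single huge entry */
	return !pmd_bad(pmd);	/* sane pmd pointing to a pte table */
}
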
@@ -1024,7 +1072,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
resource_size_t key, align_start, align_size, align_end;
struct device *device = devmem->device;
int ret, nid, is_ram;
- unsigned long pfn;
align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
align_size = ALIGN(devmem->resource->start +
@@ -1109,11 +1156,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
align_size >> PAGE_SHIFT, NULL);
mem_hotplug_done();
- for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
- struct page *page = pfn_to_page(pfn);
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, &devmem->pagemap);
- page->pgmap = &devmem->pagemap;
- }
return 0;
error_add_memory:
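
The removed loop only set page->pgmap; the bulk of struct page initialization previously ran under the memory hotplug lock. memmap_init_zone_device() now performs both steps after mem_hotplug_done(). A rough sketch of the pgmap assignment it subsumes (hypothetical helper, illustrative only; the real initializer also sets up zone linkage for each page):

static void my_set_pgmap(unsigned long pfn_first, unsigned long pfn_last,
			 struct dev_pagemap *pgmap)
{
	unsigned long pfn;

	for (pfn = pfn_first; pfn < pfn_last; pfn++)
		pfn_to_page(pfn)->pgmap = pgmap;
}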