16 files changed, 1201 insertions, 338 deletions
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index cfa44a6adc0a..50ff9ba3a236 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -24,8 +24,6 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
 
-#include "proc-macros.S"
-
 /*
  *	flush_icache_range(start,end)
  *
@@ -81,26 +79,32 @@ ENDPROC(__flush_cache_user_range)
 /*
  *	__flush_dcache_area(kaddr, size)
  *
- *	Ensure that the data held in the page kaddr is written back to the
- *	page in question.
+ *	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ *	are cleaned and invalidated to the PoC.
  *
  *	- kaddr   - kernel address
  *	- size    - size in question
  */
 ENTRY(__flush_dcache_area)
-	dcache_line_size x2, x3
-	add	x1, x0, x1
-	sub	x3, x2, #1
-	bic	x0, x0, x3
-1:	dc	civac, x0			// clean & invalidate D line / unified line
-	add	x0, x0, x2
-	cmp	x0, x1
-	b.lo	1b
-	dsb	sy
+	dcache_by_line_op civac, sy, x0, x1, x2, x3
 	ret
 ENDPIPROC(__flush_dcache_area)
 
 /*
+ *	__clean_dcache_area_pou(kaddr, size)
+ *
+ *	Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ *	are cleaned to the PoU.
+ *
+ *	- kaddr   - kernel address
+ *	- size    - size in question
+ */
+ENTRY(__clean_dcache_area_pou)
+	dcache_by_line_op cvau, ish, x0, x1, x2, x3
+	ret
+ENDPROC(__clean_dcache_area_pou)
+
+/*
  *	__inval_cache_range(start, end)
  *	- start   - start address of region
  *	- end     - end address of region
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index e87f53ff5f58..7275628ba59f 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -187,7 +187,7 @@ switch_mm_fastpath:
 
 static int asids_init(void)
 {
-	int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4);
+	int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4);
 
 	switch (fld) {
 	default:
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 13bbc3be6f5a..22e4cb4d6f53 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -24,8 +24,9 @@
 
 void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr)
 {
+	struct page *page = virt_to_page(kto);
 	copy_page(kto, kfrom);
-	__flush_dcache_area(kto, PAGE_SIZE);
+	flush_dcache_page(page);
 }
 EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
 
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 354144e33218..a6e757cbab77 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -40,7 +40,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
 static struct gen_pool *atomic_pool;
 
 #define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
-static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
 
 static int __init early_coherent_pool(char *p)
 {
@@ -896,7 +896,7 @@ static int __iommu_attach_notifier(struct notifier_block *nb,
 	return 0;
 }
 
-static int register_iommu_dma_ops_notifier(struct bus_type *bus)
+static int __init register_iommu_dma_ops_notifier(struct bus_type *bus)
 {
 	struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL);
 	int ret;
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 5a22a119a74c..6be918478f85 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -35,7 +35,9 @@ struct addr_marker {
 };
 
 enum address_markers_idx {
-	VMALLOC_START_NR = 0,
+	MODULES_START_NR = 0,
+	MODULES_END_NR,
+	VMALLOC_START_NR,
 	VMALLOC_END_NR,
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	VMEMMAP_START_NR,
@@ -45,12 +47,12 @@ enum address_markers_idx {
 	FIXADDR_END_NR,
 	PCI_START_NR,
 	PCI_END_NR,
-	MODULES_START_NR,
-	MODUELS_END_NR,
 	KERNEL_SPACE_NR,
 };
 
 static struct addr_marker address_markers[] = {
+	{ MODULES_VADDR,	"Modules start" },
+	{ MODULES_END,		"Modules end" },
 	{ VMALLOC_START,	"vmalloc() Area" },
 	{ VMALLOC_END,		"vmalloc() End" },
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = {
 	{ FIXADDR_TOP,		"Fixmap end" },
 	{ PCI_IO_START,		"PCI I/O start" },
 	{ PCI_IO_END,		"PCI I/O end" },
-	{ MODULES_VADDR,	"Modules start" },
-	{ MODULES_END,		"Modules end" },
-	{ PAGE_OFFSET,		"Kernel Mapping" },
+	{ PAGE_OFFSET,		"Linear Mapping" },
 	{ -1,			NULL },
 };
 
@@ -90,6 +90,11 @@ struct prot_bits {
 
 static const struct prot_bits pte_bits[] = {
 	{
+		.mask	= PTE_VALID,
+		.val	= PTE_VALID,
+		.set	= " ",
+		.clear	= "F",
+	}, {
 		.mask	= PTE_USER,
 		.val	= PTE_USER,
 		.set	= "USR",
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 79444279ba8c..81acd4706878 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(instruction_pointer(regs));
 	if (fixup)
-		regs->pc = fixup->fixup;
+		regs->pc = (unsigned long)&fixup->fixup + fixup->fixup;
 
 	return fixup != NULL;
 }
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 86485415c5f0..5cc1e418cb79 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -43,6 +43,28 @@
 
 static const char *fault_name(unsigned int esr);
 
+#ifdef CONFIG_KPROBES
+/* 1 if a running kprobe's fault handler returned non-zero, else 0. */
+static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
+{
+	int handled;
+
+	if (user_mode(regs))
+		return 0;
+
+	/* kprobe_running() needs smp_processor_id(): keep preemption off. */
+	preempt_disable();
+	handled = kprobe_running() && kprobe_fault_handler(regs, esr);
+	preempt_enable();
+	return handled;
+}
+#else
+static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
+{
+	return 0;
+}
+#endif
+
 /*
  * Dump out the page tables associated with 'addr' in mm 'mm'.
  */
@@ -244,6 +266,14 @@ out:
 	return fault;
 }
 
+/* Non-zero iff esr has EC == ESR_ELx_EC_DABT_CUR and FSC type == PERM. */
+static inline int permission_fault(unsigned int esr)
+{
+	if (((esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) != ESR_ELx_EC_DABT_CUR)
+		return 0;
+	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM;
+}
+
 static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 				   struct pt_regs *regs)
 {
@@ -253,6 +283,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
+	if (notify_page_fault(regs, esr))
+		return 0;
+
 	tsk = current;
 	mm  = tsk->mm;
 
@@ -277,12 +310,13 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		mm_flags |= FAULT_FLAG_WRITE;
 	}
 
-	/*
-	 * PAN bit set implies the fault happened in kernel space, but not
-	 * in the arch's user access functions.
-	 */
-	if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT))
-		goto no_context;
+	if (permission_fault(esr) && (addr < USER_DS)) {
+		if (get_fs() == KERNEL_DS)
+			die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
+
+		if (!search_exception_tables(regs->pc))
+			die("Accessing user space memory outside uaccess.h routines", regs, esr);
+	}
 
 	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
@@ -609,6 +643,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
 
 	return 0;
 }
+NOKPROBE_SYMBOL(do_debug_exception);
 
 #ifdef CONFIG_ARM64_PAN
 int cpu_enable_pan(void *__unused)
@@ -624,3 +659,17 @@ int cpu_enable_pan(void *__unused)
 	return 0;
 }
 #endif /* CONFIG_ARM64_PAN */
+
+#ifdef CONFIG_ARM64_UAO
+/*
+ * Kernel threads have fs=KERNEL_DS by default and don't need to call
+ * set_fs(); devtmpfs in particular relies on this behaviour.
+ * The feature has to be enabled at runtime (rather than by adding it to
+ * PSR_MODE_EL1h) because the CPU may not implement it. Always returns 0.
+ */
+int cpu_enable_uao(void *__unused)
+{
+	asm(SET_PSTATE_UAO(1));
+	return 0;
+}
+#endif /* CONFIG_ARM64_UAO */
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index c26b804015e8..46649d6e6c5a 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 		__flush_icache_all();
 }
 
+/* Aliasing I-cache: clean D-cache to PoU, flush all I-cache; else by range. */
+static void sync_icache_aliases(void *kaddr, unsigned long len)
+{
+	if (!icache_is_aliasing()) {
+		flush_icache_range((unsigned long)kaddr,
+				   (unsigned long)kaddr + len);
+		return;
+	}
+	__clean_dcache_area_pou(kaddr, len);
+	__flush_icache_all();
+}
+
 static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 				unsigned long uaddr, void *kaddr,
 				unsigned long len)
 {
-	if (vma->vm_flags & VM_EXEC) {
-		unsigned long addr = (unsigned long)kaddr;
-		if (icache_is_aliasing()) {
-			__flush_dcache_area(kaddr, len);
-			__flush_icache_all();
-		} else {
-			flush_icache_range(addr, addr + len);
-		}
-	}
+	if (vma->vm_flags & VM_EXEC)
+		sync_icache_aliases(kaddr, len);
 }
 
 /*
@@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr)
 	if (!page_mapping(page))
 		return;
 
-	if (!test_and_set_bit(PG_dcache_clean, &page->flags)) {
-		__flush_dcache_area(page_address(page),
-				PAGE_SIZE << compound_order(page));
+	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
+		sync_icache_aliases(page_address(page),
+				    PAGE_SIZE << compound_order(page));
+	else if (icache_is_aivivt())
 		__flush_icache_all();
-	} else if (icache_is_aivivt()) {
-		__flush_icache_all();
-	}
 }
 
 /*
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 383b03ff38f8..019f13637fae 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -41,15 +41,261 @@ int pud_huge(pud_t pud)
 #endif
 }
 
+/*
+ * Returns 1, CONT_PTES or CONT_PMDS for @pte; *pgsize gets PAGE_/PMD_SIZE.
+ */
+static int find_num_contig(struct mm_struct *mm, unsigned long addr,
+			   pte_t *ptep, pte_t pte, size_t *pgsize)
+{
+	pmd_t *pmd;
+
+	*pgsize = PAGE_SIZE;
+	if (!pte_cont(pte))
+		return 1;
+
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr);
+	if ((pte_t *)pmd != ptep)
+		return CONT_PTES;
+	*pgsize = PMD_SIZE;
+	return CONT_PMDS;
+}
+
+/* Install @pte at @ptep, replicating it over every entry of a cont range. */
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+			    pte_t *ptep, pte_t pte)
+{
+	size_t step;
+	int left = find_num_contig(mm, addr, ptep, pte, &step);
+	unsigned long pfn = pte_pfn(pte);
+	pgprot_t prot;
+
+	if (left == 1) {
+		set_pte_at(mm, addr, ptep, pte);
+		return;
+	}
+
+	/* XOR away the pfn bits, leaving only the remaining bits of @pte. */
+	prot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+	for (; left > 0; left--, ptep++, addr += step) {
+		pte_t entry = pfn_pte(pfn, prot);
+
+		pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
+			 pte_val(entry));
+		set_pte_at(mm, addr, ptep, entry);
+		pfn += step >> PAGE_SHIFT;
+	}
+}
+
+/* Allocate the entry for a huge page of size @sz at @addr (NULL on error). */
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+		      unsigned long addr, unsigned long sz)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+
+	pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz);
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return NULL;
+
+	if (sz == PUD_SIZE) {
+		pte = (pte_t *)pud;
+	} else if (sz == (PAGE_SIZE * CONT_PTES)) {
+		pmd = pmd_alloc(mm, pud, addr);
+		WARN_ON(addr & (sz - 1));
+		/* pte_alloc_map() would dereference a NULL pmd. */
+		if (!pmd)
+			return NULL;
+		/*
+		 * No pte_unmap() pairs with this pte_alloc_map(): a port
+		 * to 32-bit arm with CONFIG_HIGHPTE set would need one.
+		 */
+		pte = pte_alloc_map(mm, NULL, pmd, addr);
+	} else if (sz == PMD_SIZE) {
+		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
+		    pud_none(*pud))
+			pte = huge_pmd_share(mm, addr, pud);
+		else
+			pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	} else if (sz == (PMD_SIZE * CONT_PMDS)) {
+		pmd = pmd_alloc(mm, pud, addr);
+		WARN_ON(addr & (sz - 1));
+		return (pte_t *)pmd;
+	}
+
+	/* pte is NULL on allocation failure or unknown sz: don't deref. */
+	pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr,
+	       sz, pte, pte ? pte_val(*pte) : 0ULL);
+	return pte;
+}
+
+/*
+ * Return the huge entry found for @addr in @mm (for a contiguous range,
+ * the entry for @addr masked by CONT_{PMD,PTE}_MASK), or NULL if none.
+ */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
+	if (!pgd_present(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		return NULL;
+	if (pud_huge(*pud))
+		return (pte_t *)pud;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return NULL;
+	/* Contiguous pmd: look the entry up again with the masked address. */
+	if (pte_cont(pmd_pte(*pmd)))
+		return (pte_t *)pmd_offset(pud, (addr & CONT_PMD_MASK));
+	if (pmd_huge(*pmd))
+		return (pte_t *)pmd;
+
+	pte = pte_offset_kernel(pmd, addr);
+	if (!pte_present(*pte) || !pte_cont(*pte))
+		return NULL;
+	/* Contiguous pte: likewise, use the masked address. */
+	return pte_offset_kernel(pmd, (addr & CONT_PTE_MASK));
+}
+
+/* Apply pte_mkcont()/pmd_mkcont() to @entry for CONT_{PTE,PMD}_SIZE pages. */
+pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+			 struct page *page, int writable)
+{
+	size_t sz = huge_page_size(hstate_vma(vma));
+
+	if (sz == CONT_PTE_SIZE)
+		return pte_mkcont(entry);
+	if (sz == CONT_PMD_SIZE)
+		return pmd_pte(pmd_mkcont(pte_pmd(entry)));
+	if (sz != PUD_SIZE && sz != PMD_SIZE)
+		pr_warn("%s: unrecognized huge page size 0x%lx\n",
+			__func__, sz);
+	return entry;
+}
+
+/*
+ * Clear the huge mapping at @ptep and return its old value. For a
+ * contiguous range every entry is cleared and the first entry's value
+ * is returned, marked dirty if any later entry was dirty.
+ */
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr, pte_t *ptep)
+{
+	int remaining;
+	size_t step;
+	pte_t *cur;
+	pte_t first;
+	bool dirty = false;
+
+	if (!pte_cont(*ptep))
+		return ptep_get_and_clear(mm, addr, ptep);
+
+	cur = huge_pte_offset(mm, addr);
+	remaining = find_num_contig(mm, addr, cur, *cur, &step);
+	/* The first entry's value is what gets returned. */
+	first = ptep_get_and_clear(mm, addr, cur);
+	while (--remaining > 0) {
+		addr += step;
+		cur++;
+		/*
+		 * With HW_AFDBM the hardware may set the dirty bit on any
+		 * entry of the set, so every one has to be inspected.
+		 */
+		if (pte_dirty(ptep_get_and_clear(mm, addr, cur)))
+			dirty = true;
+	}
+	return dirty ? pte_mkdirty(first) : first;
+}
+
+/*
+ * Apply the access flags of @pte to the mapping at @ptep. For a
+ * contiguous @pte each entry of the range is updated; the result is the
+ * OR of the per-entry ptep_set_access_flags() return values.
+ */
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+			       unsigned long addr, pte_t *ptep,
+			       pte_t pte, int dirty)
+{
+	int left, changed = 0;
+	size_t step = 0;
+	unsigned long pfn;
+	pgprot_t prot;
+	pte_t *cur;
+
+	if (!pte_cont(pte))
+		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+	/* Keep every bit of @pte except its pfn. */
+	pfn = pte_pfn(pte);
+	prot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+	cur = huge_pte_offset(vma->vm_mm, addr);
+	pfn = pte_pfn(*cur);
+	left = find_num_contig(vma->vm_mm, addr, cur, *cur, &step);
+	for (; left > 0; left--, cur++, addr += step) {
+		changed |= ptep_set_access_flags(vma, addr, cur,
+						 pfn_pte(pfn, prot), dirty);
+		pfn += step >> PAGE_SHIFT;
+	}
+	return changed;
+}
+
+/* ptep_set_wrprotect() on @ptep, or on each entry of its contiguous range. */
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+			     unsigned long addr, pte_t *ptep)
+{
+	size_t step = 0;
+	pte_t *cur = ptep;
+	int left = 1;
+
+	if (pte_cont(*ptep)) {
+		cur = huge_pte_offset(mm, addr);
+		left = find_num_contig(mm, addr, cur, *cur, &step);
+	}
+
+	for (; left > 0; left--, cur++, addr += step)
+		ptep_set_wrprotect(mm, addr, cur);
+}
+
+/* ptep_clear_flush() on @ptep, or on each entry of its contiguous range. */
+void huge_ptep_clear_flush(struct vm_area_struct *vma,
+			   unsigned long addr, pte_t *ptep)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	size_t step = 0;
+	pte_t *cur = ptep;
+	int left = 1;
+
+	if (pte_cont(*ptep)) {
+		cur = huge_pte_offset(mm, addr);
+		left = find_num_contig(mm, addr, cur, *cur, &step);
+	}
+
+	for (; left > 0; left--, cur++, addr += step)
+		ptep_clear_flush(vma, addr, cur);
+}
+
 static __init int setup_hugepagesz(char *opt)
 {
 	unsigned long ps = memparse(opt, &opt);
+
 	if (ps == PMD_SIZE) {
 		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
 	} else if (ps == PUD_SIZE) {
 		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 	} else {
-		pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
+		pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
 		return 0;
 	}
 	return 1;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index efd89ce4533d..d9fb6d80ce0a 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -29,13 +29,19 @@
 #include <linux/gfp.h>
 #include <linux/memblock.h>
 #include <linux/sort.h>
+#include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/dma-mapping.h>
 #include <linux/dma-contiguous.h>
 #include <linux/efi.h>
 #include <linux/swiotlb.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
 
+#include <asm/boot.h>
 #include <asm/fixmap.h>
+#include <asm/kasan.h>
+#include <asm/kernel-pgtable.h>
 #include <asm/memory.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
@@ -45,7 +51,13 @@
 
 #include "mm.h"
 
-phys_addr_t memstart_addr __read_mostly = 0;
+/*
+ * We need to be able to catch inadvertent references to memstart_addr
+ * that occur (potentially in generic code) before arm64_memblock_init()
+ * executes, which assigns it its actual value. So use a default value
+ * that cannot be mistaken for a real physical address.
+ */
+s64 memstart_addr __read_mostly = -1;
 phys_addr_t arm64_dma_phys_limit __read_mostly;
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -58,20 +70,156 @@ static int __init early_initrd(char *p)
 	if (*endp == ',') {
 		size = memparse(endp + 1, NULL);
 
-		initrd_start = (unsigned long)__va(start);
-		initrd_end = (unsigned long)__va(start + size);
+		initrd_start = start;
+		initrd_end = start + size;
 	}
 	return 0;
 }
 early_param("initrd", early_initrd);
 #endif
 
+#ifdef CONFIG_KEXEC_CORE
+/*
+ * reserve_crashkernel() - reserves memory for crash kernel
+ *
+ * This function reserves memory area given in "crashkernel=" kernel command
+ * line parameter. The memory reserved is used by dump capture kernel when
+ * primary kernel is crashing.
+ */
+static void __init reserve_crashkernel(void)
+{
+	unsigned long long base, size;
+
+	/* Nothing to do without a valid, non-empty crashkernel= option. */
+	if (parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+			      &size, &base) || !size)
+		return;
+
+	size = PAGE_ALIGN(size);
+	if (base) {
+		/*
+		 * An explicit base was requested: it must be unreserved
+		 * memory and 2MB aligned, otherwise give up.
+		 */
+		if (!memblock_is_region_memory(base, size)) {
+			pr_warn("cannot reserve crashkernel: region is not memory\n");
+			return;
+		}
+		if (memblock_is_region_reserved(base, size)) {
+			pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
+			return;
+		}
+		if (!IS_ALIGNED(base, SZ_2M)) {
+			pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
+			return;
+		}
+	} else {
+		/*
+		 * No base given: search below ARCH_LOW_ADDRESS_LIMIT. The
+		 * current arm64 boot protocol requires 2MB alignment.
+		 */
+		base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
+					      size, SZ_2M);
+		if (!base) {
+			pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+				size);
+			return;
+		}
+	}
+	memblock_reserve(base, size);
+	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		base, base + size, size >> 20);
+
+	crashk_res.start = base;
+	crashk_res.end = base + size - 1;
+}
+
+/* With CONFIG_HIBERNATION, flag every crash kernel page as Reserved. */
+static void __init kexec_reserve_crashkres_pages(void)
+{
+#ifdef CONFIG_HIBERNATION
+	phys_addr_t pa;
+	phys_addr_t limit;
+
+	if (!crashk_res.end)
+		return;
+
+	limit = crashk_res.end + 1;
+
+	/*
+	 * All the pages start out marked as Reserved in order to reduce
+	 * the size of the hibernation image.
+	 */
+	for (pa = crashk_res.start; pa < limit; pa += PAGE_SIZE)
+		SetPageReserved(phys_to_page(pa));
+#endif
+}
+#else /* !CONFIG_KEXEC_CORE */
+static void __init reserve_crashkernel(void)
+{
+}
+
+static void __init kexec_reserve_crashkres_pages(void)
+{
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_DUMP
+/* Read "linux,elfcorehdr" from /chosen; len is bytes, cells are __be32. */
+static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
+		const char *uname, int depth, void *data)
+{
+	int cells = dt_root_addr_cells + dt_root_size_cells;
+	const __be32 *reg;
+	int len;
+
+	if (depth != 1 || strcmp(uname, "chosen") != 0)
+		return 0;
+
+	reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
+	if (!reg || len < cells * (int)sizeof(*reg))
+		return 1;
+	elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
+	elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);
+	return 1;
+}
+
+/*
+ * reserve_elfcorehdr() - reserves memory for elf core header
+ *
+ * This function reserves the memory occupied by an elf core header
+ * described in the device tree. This region contains all the
+ * information about primary kernel's core image and is used by a dump
+ * capture kernel to access the system memory on primary kernel.
+ */
+static void __init reserve_elfcorehdr(void)
+{
+	/* The scan callback records elfcorehdr_addr and elfcorehdr_size. */
+	of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);
+
+	if (elfcorehdr_size == 0)
+		return;
+
+	if (!memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
+		memblock_reserve(elfcorehdr_addr, elfcorehdr_size);
+		pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
+			elfcorehdr_size >> 10, elfcorehdr_addr);
+		return;
+	}
+
+	pr_warn("elfcorehdr is overlapped\n");
+}
+#else /* !CONFIG_CRASH_DUMP */
+static void __init reserve_elfcorehdr(void)
+{
+}
+#endif /* CONFIG_CRASH_DUMP */
 /*
  * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It
  * currently assumes that for memory starting above 4G, 32-bit devices will
  * use a DMA offset.
  */
-static phys_addr_t max_zone_dma_phys(void)
+static phys_addr_t __init max_zone_dma_phys(void)
 {
 	phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
 	return min(offset + (1ULL << 32), memblock_end_of_DRAM());
@@ -126,11 +274,11 @@ EXPORT_SYMBOL(pfn_valid);
 #endif
 
 #ifndef CONFIG_SPARSEMEM
-static void arm64_memory_present(void)
+static void __init arm64_memory_present(void)
 {
 }
 #else
-static void arm64_memory_present(void)
+static void __init arm64_memory_present(void)
 {
 	struct memblock_region *reg;
 
@@ -157,9 +305,94 @@ static int __init early_mem(char *p)
 }
 early_param("mem", early_mem);
 
+/* Read "linux,usable-memory-range" from /chosen; len is bytes, not cells. */
+static int __init early_init_dt_scan_usablemem(unsigned long node,
+		const char *uname, int depth, void *data)
+{
+	int cells = dt_root_addr_cells + dt_root_size_cells;
+	struct memblock_region *usablemem = data;
+	const __be32 *reg;
+	int len;
+
+	if (depth != 1 || strcmp(uname, "chosen") != 0)
+		return 0;
+
+	reg = of_get_flat_dt_prop(node, "linux,usable-memory-range", &len);
+	if (!reg || len < cells * (int)sizeof(*reg))
+		return 1;
+	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
+	return 1;
+}
+
+/* Cap memblock to "linux,usable-memory-range" when the scan finds one. */
+static void __init fdt_enforce_memory_region(void)
+{
+	struct memblock_region usable = { .size = 0 };
+
+	of_scan_flat_dt(early_init_dt_scan_usablemem, &usable);
+	if (!usable.size)
+		return;
+
+	memblock_cap_memory_range(usable.base, usable.size);
+}
+
 void __init arm64_memblock_init(void)
 {
-	memblock_enforce_memory_limit(memory_limit);
+	const s64 linear_region_size = -(s64)PAGE_OFFSET;
+
+	/* Handle linux,usable-memory-range property */
+	fdt_enforce_memory_region();
+
+	/*
+	 * Ensure that the linear region takes up exactly half of the kernel
+	 * virtual address space. This way, we can distinguish a linear address
+	 * from a kernel/module/vmalloc address by testing a single bit.
+	 */
+	BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));
+
+	/*
+	 * Select a suitable value for the base of physical memory.
+	 */
+	memstart_addr = round_down(memblock_start_of_DRAM(),
+				   ARM64_MEMSTART_ALIGN);
+
+	/*
+	 * Remove the memory that we will not be able to cover with the
+	 * linear mapping. Take care not to clip the kernel which may be
+	 * high in memory.
+	 */
+	memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)),
+			ULLONG_MAX);
+	if (memblock_end_of_DRAM() > linear_region_size)
+		memblock_remove(0, memblock_end_of_DRAM() - linear_region_size);
+
+	/*
+	 * Apply the memory limit if it was set. Since the kernel may be loaded
+	 * high up in memory, add back the kernel region that must be accessible
+	 * via the linear mapping.
+	 */
+	if (memory_limit != (phys_addr_t)ULLONG_MAX) {
+		memblock_enforce_memory_limit(memory_limit);
+		memblock_add(__pa(_text), (u64)(_end - _text));
+	}
+
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+		extern u16 memstart_offset_seed;
+		u64 range = linear_region_size -
+			    (memblock_end_of_DRAM() - memblock_start_of_DRAM());
+
+		/*
+		 * If the size of the linear region exceeds, by a sufficient
+		 * margin, the size of the region that the available physical
+		 * memory spans, randomize the linear region as well.
+		 */
+		if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
+			range = range / ARM64_MEMSTART_ALIGN + 1;
+			memstart_addr -= ARM64_MEMSTART_ALIGN *
+					 ((range * memstart_offset_seed) >> 16);
+		}
+	}
 
 	/*
 	 * Register the kernel text, kernel data, initrd, and initial
@@ -167,8 +400,13 @@ void __init arm64_memblock_init(void)
 	 */
 	memblock_reserve(__pa(_text), _end - _text);
 #ifdef CONFIG_BLK_DEV_INITRD
-	if (initrd_start)
-		memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start);
+	if (initrd_start) {
+		memblock_reserve(initrd_start, initrd_end - initrd_start);
+
+		/* the generic initrd code expects virtual addresses */
+		initrd_start = __phys_to_virt(initrd_start);
+		initrd_end = __phys_to_virt(initrd_end);
+	}
 #endif
 
 	early_init_fdt_scan_reserved_mem();
@@ -178,6 +416,11 @@ void __init arm64_memblock_init(void)
 		arm64_dma_phys_limit = max_zone_dma_phys();
 	else
 		arm64_dma_phys_limit = PHYS_MASK + 1;
+
+	reserve_crashkernel();
+
+	reserve_elfcorehdr();
+
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
 	dma_contiguous_reserve(arm64_dma_phys_limit);
 
@@ -291,6 +534,8 @@ void __init mem_init(void)
 	/* this will put all unused low memory onto the freelists */
 	free_all_bootmem();
 
+	kexec_reserve_crashkres_pages();
+
 	mem_init_print_info(NULL);
 
 #define MLK(b, t) b, t, ((t) - (b)) >> 10
@@ -302,35 +547,38 @@ void __init mem_init(void)
 #ifdef CONFIG_KASAN
 		  "    kasan   : 0x%16lx - 0x%16lx   (%6ld GB)\n"
 #endif
+		  "    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n"
 		  "    vmalloc : 0x%16lx - 0x%16lx   (%6ld GB)\n"
+		  "      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n"
+		  "      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n"
+		  "    .rodata : 0x%p" " - 0x%p" "   (%6ld KB)\n"
+		  "      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n"
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 		  "    vmemmap : 0x%16lx - 0x%16lx   (%6ld GB maximum)\n"
 		  "              0x%16lx - 0x%16lx   (%6ld MB actual)\n"
 #endif
 		  "    fixed   : 0x%16lx - 0x%16lx   (%6ld KB)\n"
 		  "    PCI I/O : 0x%16lx - 0x%16lx   (%6ld MB)\n"
-		  "    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n"
-		  "    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n"
-		  "      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n"
-		  "      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n"
-		  "      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+		  "    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n",
 #ifdef CONFIG_KASAN
 		  MLG(KASAN_SHADOW_START, KASAN_SHADOW_END),
 #endif
+		  MLM(MODULES_VADDR, MODULES_END),
 		  MLG(VMALLOC_START, VMALLOC_END),
+		  MLK_ROUNDUP(__init_begin, __init_end),
+		  MLK_ROUNDUP(_text, __start_rodata),
+		  MLK_ROUNDUP(__start_rodata, _etext),
+		  MLK_ROUNDUP(_sdata, _edata),
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 		  MLG(VMEMMAP_START,
 		      VMEMMAP_START + VMEMMAP_SIZE),
-		  MLM((unsigned long)virt_to_page(PAGE_OFFSET),
+		  MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()),
 		      (unsigned long)virt_to_page(high_memory)),
 #endif
 		  MLK(FIXADDR_START, FIXADDR_TOP),
 		  MLM(PCI_IO_START, PCI_IO_END),
-		  MLM(MODULES_VADDR, MODULES_END),
-		  MLM(PAGE_OFFSET, (unsigned long)high_memory),
-		  MLK_ROUNDUP(__init_begin, __init_end),
-		  MLK_ROUNDUP(_text, _etext),
-		  MLK_ROUNDUP(_sdata, _edata));
+		  MLM(__phys_to_virt(memblock_start_of_DRAM()),
+		      (unsigned long)high_memory));
 
 #undef MLK
 #undef MLM
@@ -358,9 +606,8 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-	fixup_init();
 	free_initmem_default(0);
-	free_alternatives_memory();
+	fixup_init();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -381,3 +628,28 @@ static int __init keepinitrd_setup(char *__unused)
 
 __setup("keepinitrd", keepinitrd_setup);
 #endif
+
+/*
+ * Dump out memory limit information on panic.
+ */
+static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p)
+{
+	if (memory_limit == (phys_addr_t)ULLONG_MAX)
+		pr_emerg("Memory Limit: none\n");
+	else
+		pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
+
+	return 0;
+}
+
+static struct notifier_block mem_limit_notifier = {
+	.notifier_call = dump_mem_limit,
+};
+
+static int __init register_mem_limit_dumper(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list,
+				       &mem_limit_notifier);
+	return 0;	/* always 0: the registration result is ignored */
+}
+__initcall(register_mem_limit_dumper);
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index cf038c7d9fa9..757009daa9ed 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -16,9 +16,12 @@
 #include <linux/memblock.h>
 #include <linux/start_kernel.h>
 
+#include <asm/mmu_context.h>
+#include <asm/kernel-pgtable.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
+#include <asm/sections.h>
 #include <asm/tlbflush.h>
 
 static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
@@ -32,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
 	if (pmd_none(*pmd))
 		pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
 
-	pte = pte_offset_kernel(pmd, addr);
+	pte = pte_offset_kimg(pmd, addr);
 	do {
 		next = addr + PAGE_SIZE;
 		set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page),
@@ -50,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud,
 	if (pud_none(*pud))
 		pud_populate(&init_mm, pud, kasan_zero_pmd);
 
-	pmd = pmd_offset(pud, addr);
+	pmd = pmd_offset_kimg(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		kasan_early_pte_populate(pmd, addr, next);
@@ -67,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd,
 	if (pgd_none(*pgd))
 		pgd_populate(&init_mm, pgd, kasan_zero_pud);
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_offset_kimg(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		kasan_early_pmd_populate(pud, addr, next);
@@ -96,6 +99,21 @@ asmlinkage void __init kasan_early_init(void)
 	kasan_map_early_shadow();
 }
 
+/*
+ * Copy the current shadow region into a new pgdir.
+ */
+void __init kasan_copy_shadow(pgd_t *pgdir)
+{
+	pgd_t *src = pgd_offset_k(KASAN_SHADOW_START);
+	pgd_t *end = pgd_offset_k(KASAN_SHADOW_END);
+	pgd_t *dst = pgd_offset_raw(pgdir, KASAN_SHADOW_START);
+
+	do {
+		set_pgd(dst, *src);
+		dst++;
+	} while (++src != end);
+}
+
 static void __init clear_pgds(unsigned long start,
 			unsigned long end)
 {
@@ -108,18 +126,18 @@ static void __init clear_pgds(unsigned long start,
 		set_pgd(pgd_offset_k(start), __pgd(0));
 }
 
-static void __init cpu_set_ttbr1(unsigned long ttbr1)
-{
-	asm(
-	"	msr	ttbr1_el1, %0\n"
-	"	isb"
-	:
-	: "r" (ttbr1));
-}
-
 void __init kasan_init(void)
 {
+	u64 kimg_shadow_start, kimg_shadow_end;
+	u64 mod_shadow_start, mod_shadow_end;
 	struct memblock_region *reg;
+	int i;
+
+	kimg_shadow_start = (u64)kasan_mem_to_shadow(_text);
+	kimg_shadow_end = (u64)kasan_mem_to_shadow(_end);
+
+	mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR);
+	mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END);
 
 	/*
 	 * We are going to perform proper setup of shadow memory.
@@ -129,13 +147,33 @@ void __init kasan_init(void)
 	 * setup will be finished.
 	 */
 	memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
-	cpu_set_ttbr1(__pa(tmp_pg_dir));
-	flush_tlb_all();
+	dsb(ishst);
+	cpu_replace_ttbr1(tmp_pg_dir);
 
 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
 
+	vmemmap_populate(kimg_shadow_start, kimg_shadow_end,
+			 pfn_to_nid(virt_to_pfn(_text)));
+
+	/*
+	 * vmemmap_populate() has populated the shadow region that covers the
+	 * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round
+	 * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent
+	 * kasan_populate_zero_shadow() from replacing the page table entries
+	 * (PMD or PTE) at the edges of the shadow region for the kernel
+	 * image.
+	 */
+	kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE);
+	kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE);
+
 	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
-			kasan_mem_to_shadow((void *)MODULES_VADDR));
+				   (void *)mod_shadow_start);
+	kasan_populate_zero_shadow((void *)kimg_shadow_end,
+				   kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+	if (kimg_shadow_start > mod_shadow_end)
+		kasan_populate_zero_shadow((void *)mod_shadow_end,
+					   (void *)kimg_shadow_start);
 
 	for_each_memblock(memory, reg) {
 		void *start = (void *)__phys_to_virt(reg->base);
@@ -155,9 +193,16 @@ void __init kasan_init(void)
 				pfn_to_nid(virt_to_pfn(start)));
 	}
 
+	/*
+	 * KAsan may reuse the contents of kasan_zero_pte directly, so we
+	 * should make sure that it maps the zero page read-only.
+	 */
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		set_pte(&kasan_zero_pte[i],
+			pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
+
 	memset(kasan_zero_page, 0, PAGE_SIZE);
-	cpu_set_ttbr1(__pa(swapper_pg_dir));
-	flush_tlb_all();
+	cpu_replace_ttbr1(swapper_pg_dir);
 
 	/* At this point kasan is fully initialized. Enable error messages */
 	init_task.kasan_depth = 0;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 51ac84e0812d..233e0a912247 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -21,6 +21,8 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kexec.h>
 #include <linux/libfdt.h>
 #include <linux/mman.h>
 #include <linux/nodemask.h>
@@ -30,8 +32,10 @@
 #include <linux/slab.h>
 #include <linux/stop_machine.h>
 
+#include <asm/barrier.h>
 #include <asm/cputype.h>
 #include <asm/fixmap.h>
+#include <asm/kasan.h>
 #include <asm/kernel-pgtable.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
@@ -44,13 +48,20 @@
 
 u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
 
+u64 kimage_voffset __read_mostly;
+EXPORT_SYMBOL(kimage_voffset);
+
 /*
  * Empty_zero_page is a special page that is used for zero-initialized data
  * and COW.
  */
-struct page *empty_zero_page;
+unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
 
+static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
+static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
+static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
+
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 			      unsigned long size, pgprot_t vma_prot)
 {
@@ -62,16 +73,30 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
 
-static void __init *early_alloc(unsigned long sz)
+static phys_addr_t __init early_pgtable_alloc(void)
 {
 	phys_addr_t phys;
 	void *ptr;
 
-	phys = memblock_alloc(sz, sz);
+	phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
 	BUG_ON(!phys);
-	ptr = __va(phys);
-	memset(ptr, 0, sz);
-	return ptr;
+
+	/*
+	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
+	 * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
+	 * any level of table.
+	 */
+	ptr = pte_set_fixmap(phys);
+
+	memset(ptr, 0, PAGE_SIZE);
+
+	/*
+	 * Implicit barriers also ensure the zeroed page is visible to the page
+	 * table walker
+	 */
+	pte_clear_fixmap();
+
+	return phys;
 }
 
 /*
@@ -95,24 +120,30 @@ static void split_pmd(pmd_t *pmd, pte_t *pte)
 static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 				  unsigned long end, unsigned long pfn,
 				  pgprot_t prot,
-				  void *(*alloc)(unsigned long size))
+				  phys_addr_t (*pgtable_alloc)(void))
 {
 	pte_t *pte;
 
 	if (pmd_none(*pmd) || pmd_sect(*pmd)) {
-		pte = alloc(PTRS_PER_PTE * sizeof(pte_t));
+		phys_addr_t pte_phys;
+		BUG_ON(!pgtable_alloc);
+		pte_phys = pgtable_alloc();
+		pte = pte_set_fixmap(pte_phys);
 		if (pmd_sect(*pmd))
 			split_pmd(pmd, pte);
-		__pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE);
+		__pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
 		flush_tlb_all();
+		pte_clear_fixmap();
 	}
 	BUG_ON(pmd_bad(*pmd));
 
-	pte = pte_offset_kernel(pmd, addr);
+	pte = pte_set_fixmap_offset(pmd, addr);
 	do {
 		set_pte(pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	pte_clear_fixmap();
 }
 
 static void split_pud(pud_t *old_pud, pmd_t *pmd)
@@ -127,10 +158,10 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd)
 	} while (pmd++, i++, i < PTRS_PER_PMD);
 }
 
-static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
-				  unsigned long addr, unsigned long end,
+static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 				  phys_addr_t phys, pgprot_t prot,
-				  void *(*alloc)(unsigned long size))
+				  phys_addr_t (*pgtable_alloc)(void),
+				  bool allow_block_mappings)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -139,7 +170,10 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
 	 * Check for initial section mappings in the pgd/pud and remove them.
 	 */
 	if (pud_none(*pud) || pud_sect(*pud)) {
-		pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t));
+		phys_addr_t pmd_phys;
+		BUG_ON(!pgtable_alloc);
+		pmd_phys = pgtable_alloc();
+		pmd = pmd_set_fixmap(pmd_phys);
 		if (pud_sect(*pud)) {
 			/*
 			 * need to have the 1G of mappings continue to be
@@ -147,19 +181,20 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
 			 */
 			split_pud(pud, pmd);
 		}
-		pud_populate(mm, pud, pmd);
+		__pud_populate(pud, pmd_phys, PUD_TYPE_TABLE);
 		flush_tlb_all();
+		pmd_clear_fixmap();
 	}
 	BUG_ON(pud_bad(*pud));
 
-	pmd = pmd_offset(pud, addr);
+	pmd = pmd_set_fixmap_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		/* try section mapping first */
-		if (((addr | next | phys) & ~SECTION_MASK) == 0) {
+		if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
+		      (!pgtable_alloc || allow_block_mappings)) {
 			pmd_t old_pmd =*pmd;
-			set_pmd(pmd, __pmd(phys |
-					   pgprot_val(mk_sect_prot(prot))));
+			pmd_set_huge(pmd, phys, prot);
 			/*
 			 * Check for previous table entries created during
 			 * boot (__create_page_tables) and flush them.
@@ -167,17 +202,19 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
 			if (!pmd_none(old_pmd)) {
 				flush_tlb_all();
 				if (pmd_table(old_pmd)) {
-					phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0));
+					phys_addr_t table = pmd_page_paddr(old_pmd);
 					if (!WARN_ON_ONCE(slab_is_available()))
 						memblock_free(table, PAGE_SIZE);
 				}
 			}
 		} else {
 			alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys),
-				       prot, alloc);
+				       prot, pgtable_alloc);
 		}
 		phys += next - addr;
 	} while (pmd++, addr = next, addr != end);
+
+	pmd_clear_fixmap();
 }
 
 static inline bool use_1G_block(unsigned long addr, unsigned long next,
@@ -192,31 +229,33 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next,
 	return true;
 }
 
-static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd,
-				  unsigned long addr, unsigned long end,
+static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 				  phys_addr_t phys, pgprot_t prot,
-				  void *(*alloc)(unsigned long size))
+				  phys_addr_t (*pgtable_alloc)(void),
+				  bool allow_block_mappings)
 {
 	pud_t *pud;
 	unsigned long next;
 
 	if (pgd_none(*pgd)) {
-		pud = alloc(PTRS_PER_PUD * sizeof(pud_t));
-		pgd_populate(mm, pgd, pud);
+		phys_addr_t pud_phys;
+		BUG_ON(!pgtable_alloc);
+		pud_phys = pgtable_alloc();
+		__pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
 	}
 	BUG_ON(pgd_bad(*pgd));
 
-	pud = pud_offset(pgd, addr);
+	pud = pud_set_fixmap_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 
 		/*
 		 * For 4K granule only, attempt to put down a 1GB block
 		 */
-		if (use_1G_block(addr, next, phys)) {
+		if (use_1G_block(addr, next, phys) &&
+		      (!pgtable_alloc || allow_block_mappings)) {
 			pud_t old_pud = *pud;
-			set_pud(pud, __pud(phys |
-					   pgprot_val(mk_sect_prot(prot))));
+			pud_set_huge(pud, phys, prot);
 
 			/*
 			 * If we have an old value for a pud, it will
@@ -228,51 +267,78 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd,
 			if (!pud_none(old_pud)) {
 				flush_tlb_all();
 				if (pud_table(old_pud)) {
-					phys_addr_t table = __pa(pmd_offset(&old_pud, 0));
+					phys_addr_t table = pud_page_paddr(old_pud);
 					if (!WARN_ON_ONCE(slab_is_available()))
 						memblock_free(table, PAGE_SIZE);
 				}
 			}
 		} else {
-			alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc);
+			alloc_init_pmd(pud, addr, next, phys, prot,
+				       pgtable_alloc, allow_block_mappings);
 		}
 		phys += next - addr;
 	} while (pud++, addr = next, addr != end);
+
+	pud_clear_fixmap();
 }
 
 /*
  * Create the page directory entries and any necessary page tables for the
  * mapping specified by 'md'.
  */
-static void  __create_mapping(struct mm_struct *mm, pgd_t *pgd,
-				    phys_addr_t phys, unsigned long virt,
+static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt,
 				    phys_addr_t size, pgprot_t prot,
-				    void *(*alloc)(unsigned long size))
+				    phys_addr_t (*pgtable_alloc)(void),
+				    bool allow_block_mappings)
 {
 	unsigned long addr, length, end, next;
 
+	/*
+	 * If the virtual and physical address don't have the same offset
+	 * within a page, we cannot map the region as the caller expects.
+	 */
+	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
+		return;
+
+	phys &= PAGE_MASK;
 	addr = virt & PAGE_MASK;
 	length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
 
 	end = addr + length;
 	do {
 		next = pgd_addr_end(addr, end);
-		alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc);
+		alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc,
+			      (!pgtable_alloc || allow_block_mappings));
 		phys += next - addr;
 	} while (pgd++, addr = next, addr != end);
 }
 
-static void *late_alloc(unsigned long size)
+static phys_addr_t late_pgtable_alloc(void)
 {
-	void *ptr;
-
-	BUG_ON(size > PAGE_SIZE);
-	ptr = (void *)__get_free_page(PGALLOC_GFP);
+	void *ptr = (void *)__get_free_page(PGALLOC_GFP);
 	BUG_ON(!ptr);
-	return ptr;
+
+	/* Ensure the zeroed page is visible to the page table walker */
+	dsb(ishst);
+	return __pa(ptr);
 }
 
-static void __init create_mapping(phys_addr_t phys, unsigned long virt,
+static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+				 unsigned long virt, phys_addr_t size,
+				 pgprot_t prot,
+				 phys_addr_t (*alloc)(void),
+				 bool allow_block_mappings)
+{
+	init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc,
+		 allow_block_mappings);
+}
+
+/*
+ * This function can only be used to modify existing table entries,
+ * without allocating new levels of table. Note that this permits the
+ * creation of new section or page entries.
+ */
+static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 				  phys_addr_t size, pgprot_t prot)
 {
 	if (virt < VMALLOC_START) {
@@ -280,16 +346,15 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt,
 			&phys, virt);
 		return;
 	}
-	__create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt,
-			 size, prot, early_alloc);
+	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, true);
 }
 
 void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
-			       pgprot_t prot)
+			       pgprot_t prot, bool allow_block_mappings)
 {
-	__create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot,
-				late_alloc);
+	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
+			     late_pgtable_alloc, allow_block_mappings);
 }
 
 static void create_mapping_late(phys_addr_t phys, unsigned long virt,
@@ -301,69 +366,36 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt,
 		return;
 	}
 
-	return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK),
-				phys, virt, size, prot, late_alloc);
+	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
+			     late_pgtable_alloc, !debug_pagealloc_enabled());
 }
 
-#ifdef CONFIG_DEBUG_RODATA
-static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
-{
-	/*
-	 * Set up the executable regions using the existing section mappings
-	 * for now. This will get more fine grained later once all memory
-	 * is mapped
-	 */
-	unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE);
-	unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE);
-
-	if (end < kernel_x_start) {
-		create_mapping(start, __phys_to_virt(start),
-			end - start, PAGE_KERNEL);
-	} else if (start >= kernel_x_end) {
-		create_mapping(start, __phys_to_virt(start),
-			end - start, PAGE_KERNEL);
-	} else {
-		if (start < kernel_x_start)
-			create_mapping(start, __phys_to_virt(start),
-				kernel_x_start - start,
-				PAGE_KERNEL);
-		create_mapping(kernel_x_start,
-				__phys_to_virt(kernel_x_start),
-				kernel_x_end - kernel_x_start,
-				PAGE_KERNEL_EXEC);
-		if (kernel_x_end < end)
-			create_mapping(kernel_x_end,
-				__phys_to_virt(kernel_x_end),
-				end - kernel_x_end,
-				PAGE_KERNEL);
-	}
-
-}
-#else
-static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
+static void __init __map_memblock(pgd_t *pgd, phys_addr_t start,
+				  phys_addr_t end, pgprot_t prot,
+				  bool allow_block_mappings)
 {
-	create_mapping(start, __phys_to_virt(start), end - start,
-			PAGE_KERNEL_EXEC);
+	__create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start,
+			     prot, early_pgtable_alloc, allow_block_mappings);
 }
-#endif
 
-static void __init map_mem(void)
+static void __init map_mem(pgd_t *pgd)
 {
+	unsigned long kernel_start = __pa(_text);
+	unsigned long kernel_end = __pa(_etext);
 	struct memblock_region *reg;
-	phys_addr_t limit;
 
 	/*
-	 * Temporarily limit the memblock range. We need to do this as
-	 * create_mapping requires puds, pmds and ptes to be allocated from
-	 * memory addressable from the initial direct kernel mapping.
-	 *
-	 * The initial direct kernel mapping, located at swapper_pg_dir, gives
-	 * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps,
-	 * memory starting from PHYS_OFFSET (which must be aligned to 2MB as
-	 * per Documentation/arm64/booting.txt).
+	 * Take care not to create a writable alias for the
+	 * read-only text and rodata sections of the kernel image.
+	 * So temporarily mark them as NOMAP to skip mappings in
+	 * the following for-loop
 	 */
-	limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE;
-	memblock_set_current_limit(limit);
+	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
+#ifdef CONFIG_KEXEC_CORE
+	if (crashk_res.end)
+		memblock_mark_nomap(crashk_res.start,
+				    resource_size(&crashk_res));
+#endif
 
 	/* map all the memory banks */
 	for_each_memblock(memory, reg) {
@@ -372,70 +404,122 @@ static void __init map_mem(void)
 
 		if (start >= end)
 			break;
+		if (memblock_is_nomap(reg))
+			continue;
 
-		if (ARM64_SWAPPER_USES_SECTION_MAPS) {
-			/*
-			 * For the first memory bank align the start address and
-			 * current memblock limit to prevent create_mapping() from
-			 * allocating pte page tables from unmapped memory. With
-			 * the section maps, if the first block doesn't end on section
-			 * size boundary, create_mapping() will try to allocate a pte
-			 * page, which may be returned from an unmapped area.
-			 * When section maps are not used, the pte page table for the
-			 * current limit is already present in swapper_pg_dir.
-			 */
-			if (start < limit)
-				start = ALIGN(start, SECTION_SIZE);
-			if (end < limit) {
-				limit = end & SECTION_MASK;
-				memblock_set_current_limit(limit);
-			}
-		}
-		__map_memblock(start, end);
+		__map_memblock(pgd, start, end,
+			       PAGE_KERNEL, !debug_pagealloc_enabled());
 	}
 
-	/* Limit no longer required. */
-	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-}
-
-static void __init fixup_executable(void)
-{
-#ifdef CONFIG_DEBUG_RODATA
-	/* now that we are actually fully mapped, make the start/end more fine grained */
-	if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) {
-		unsigned long aligned_start = round_down(__pa(_stext),
-							 SWAPPER_BLOCK_SIZE);
-
-		create_mapping(aligned_start, __phys_to_virt(aligned_start),
-				__pa(_stext) - aligned_start,
-				PAGE_KERNEL);
-	}
+	/*
+	 * Map the linear alias of the [_text, _etext) interval as
+	 * read-only/non-executable. This makes the contents of the
+	 * region accessible to subsystems such as hibernate, but
+	 * protects it from inadvertent modification or execution.
+	 */
+	__map_memblock(pgd, kernel_start, kernel_end,
+		       PAGE_KERNEL_RO, !debug_pagealloc_enabled());
+	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 
-	if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) {
-		unsigned long aligned_end = round_up(__pa(__init_end),
-							  SWAPPER_BLOCK_SIZE);
-		create_mapping(__pa(__init_end), (unsigned long)__init_end,
-				aligned_end - __pa(__init_end),
-				PAGE_KERNEL);
+#ifdef CONFIG_KEXEC_CORE
+	/*
+	 * Use page-level mappings here so that we can shrink the region
+	 * in page granularity and put back unused memory to buddy system
+	 * through /sys/kernel/kexec_crash_size interface.
+	 */
+	if (crashk_res.end) {
+		__map_memblock(pgd, crashk_res.start, crashk_res.end + 1,
+			       PAGE_KERNEL, false);
+		memblock_clear_nomap(crashk_res.start,
+				     resource_size(&crashk_res));
 	}
 #endif
 }
 
-#ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void)
 {
-	create_mapping_late(__pa(_stext), (unsigned long)_stext,
-				(unsigned long)_etext - (unsigned long)_stext,
-				PAGE_KERNEL_ROX);
+	unsigned long section_size;
 
+	section_size = (unsigned long)__start_rodata - (unsigned long)_text;
+	create_mapping_late(__pa(_text), (unsigned long)_text,
+			    section_size, PAGE_KERNEL_ROX);
+	/*
+	 * mark .rodata as read only. Use _etext rather than __end_rodata to
+	 * cover NOTES and EXCEPTION_TABLE.
+	 */
+	section_size = (unsigned long)_etext - (unsigned long)__start_rodata;
+	create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
+			    section_size, PAGE_KERNEL_RO);
 }
-#endif
 
 void fixup_init(void)
 {
-	create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin,
-			(unsigned long)__init_end - (unsigned long)__init_begin,
-			PAGE_KERNEL);
+	/*
+	 * Unmap the __init region but leave the VM area in place. This
+	 * prevents the region from being reused for kernel modules, which
+	 * is not supported by kallsyms.
+	 */
+	unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
+}
+
+static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
+				      pgprot_t prot, struct vm_struct *vma)
+{
+	phys_addr_t pa_start = __pa(va_start);
+	unsigned long size = va_end - va_start;
+
+	BUG_ON(!PAGE_ALIGNED(pa_start));
+	BUG_ON(!PAGE_ALIGNED(size));
+
+	__create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
+			     early_pgtable_alloc, !debug_pagealloc_enabled());
+
+	vma->addr	= va_start;
+	vma->phys_addr	= pa_start;
+	vma->size	= size;
+	vma->flags	= VM_MAP;
+	vma->caller	= __builtin_return_address(0);
+
+	vm_area_add_early(vma);
+}
+
+/*
+ * Create fine-grained mappings for the kernel.
+ */
+static void __init map_kernel(pgd_t *pgd)
+{
+	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;
+
+	map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text);
+	map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata);
+	map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC,
+			   &vmlinux_init);
+	map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
+
+	if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
+		/*
+		 * The fixmap falls in a separate pgd to the kernel, and doesn't
+		 * live in the carveout for the swapper_pg_dir. We can simply
+		 * re-use the existing dir for the fixmap.
+		 */
+		set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
+			*pgd_offset_k(FIXADDR_START));
+	} else if (CONFIG_PGTABLE_LEVELS > 3) {
+		/*
+		 * The fixmap shares its top level pgd entry with the kernel
+		 * mapping. This can really only occur when we are running
+		 * with 16k/4 levels, so we can simply reuse the pud level
+		 * entry instead.
+		 */
+		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
+		set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
+			__pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
+		pud_clear_fixmap();
+	} else {
+		BUG();
+	}
+
+	kasan_copy_shadow(pgd);
 }
 
 /*
@@ -444,28 +528,35 @@ void fixup_init(void)
  */
 void __init paging_init(void)
 {
-	void *zero_page;
+	phys_addr_t pgd_phys = early_pgtable_alloc();
+	pgd_t *pgd = pgd_set_fixmap(pgd_phys);
 
-	map_mem();
-	fixup_executable();
+	map_kernel(pgd);
+	map_mem(pgd);
 
-	/* allocate the zero page. */
-	zero_page = early_alloc(PAGE_SIZE);
-
-	bootmem_init();
-
-	empty_zero_page = virt_to_page(zero_page);
+	/*
+	 * We want to reuse the original swapper_pg_dir so we don't have to
+	 * communicate the new address to non-coherent secondaries in
+	 * secondary_entry, and so cpu_switch_mm can generate the address with
+	 * adrp+add rather than a load from some global variable.
+	 *
+	 * To do this we need to go via a temporary pgd.
+	 */
+	cpu_replace_ttbr1(__va(pgd_phys));
+	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
+	cpu_replace_ttbr1(swapper_pg_dir);
 
-	/* Ensure the zero page is visible to the page table walker */
-	dsb(ishst);
+	pgd_clear_fixmap();
+	memblock_free(pgd_phys, PAGE_SIZE);
 
 	/*
-	 * TTBR0 is only used for the identity mapping at this stage. Make it
-	 * point to zero page to avoid speculatively fetching new entries.
+	 * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
+	 * allocated with it.
 	 */
-	cpu_set_reserved_ttbr0();
-	local_flush_tlb_all();
-	cpu_set_default_tcr_t0sz();
+	memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE,
+		      SWAPPER_DIR_SIZE - PAGE_SIZE);
+
+	bootmem_init();
 }
 
 /*
@@ -552,21 +643,13 @@ void vmemmap_free(unsigned long start, unsigned long end)
 }
 #endif	/* CONFIG_SPARSEMEM_VMEMMAP */
 
-static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
-#if CONFIG_PGTABLE_LEVELS > 2
-static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
-#endif
-#if CONFIG_PGTABLE_LEVELS > 3
-static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
-#endif
-
 static inline pud_t * fixmap_pud(unsigned long addr)
 {
 	pgd_t *pgd = pgd_offset_k(addr);
 
 	BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
 
-	return pud_offset(pgd, addr);
+	return pud_offset_kimg(pgd, addr);
 }
 
 static inline pmd_t * fixmap_pmd(unsigned long addr)
@@ -575,16 +658,12 @@ static inline pmd_t * fixmap_pmd(unsigned long addr)
 
 	BUG_ON(pud_none(*pud) || pud_bad(*pud));
 
-	return pmd_offset(pud, addr);
+	return pmd_offset_kimg(pud, addr);
 }
 
 static inline pte_t * fixmap_pte(unsigned long addr)
 {
-	pmd_t *pmd = fixmap_pmd(addr);
-
-	BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd));
-
-	return pte_offset_kernel(pmd, addr);
+	return &bm_pte[pte_index(addr)];
 }
 
 void __init early_fixmap_init(void)
@@ -595,15 +674,26 @@ void __init early_fixmap_init(void)
 	unsigned long addr = FIXADDR_START;
 
 	pgd = pgd_offset_k(addr);
-	pgd_populate(&init_mm, pgd, bm_pud);
-	pud = pud_offset(pgd, addr);
+	if (CONFIG_PGTABLE_LEVELS > 3 &&
+	    !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) {
+		/*
+		 * We only end up here if the kernel mapping and the fixmap
+		 * share the top level pgd entry, which should only happen on
+		 * 16k/4 levels configurations.
+		 */
+		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
+		pud = pud_offset_kimg(pgd, addr);
+	} else {
+		pgd_populate(&init_mm, pgd, bm_pud);
+		pud = fixmap_pud(addr);
+	}
 	pud_populate(&init_mm, pud, bm_pmd);
-	pmd = pmd_offset(pud, addr);
+	pmd = fixmap_pmd(addr);
 	pmd_populate_kernel(&init_mm, pmd, bm_pte);
 
 	/*
 	 * The boot-ioremap range spans multiple pmds, for which
-	 * we are not preparted:
+	 * we are not prepared:
 	 */
 	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
 		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
@@ -642,11 +732,10 @@ void __set_fixmap(enum fixed_addresses idx,
 	}
 }
 
-void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
+void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 {
 	const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
-	pgprot_t prot = PAGE_KERNEL_RO;
-	int size, offset;
+	int offset;
 	void *dt_virt;
 
 	/*
@@ -663,7 +752,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 	/*
 	 * Make sure that the FDT region can be mapped without the need to
 	 * allocate additional translation table pages, so that it is safe
-	 * to call create_mapping() this early.
+	 * to call create_mapping_noalloc() this early.
 	 *
 	 * On 64k pages, the FDT will be mapped using PTEs, so we need to
 	 * be in the same PMD as the rest of the fixmap.
@@ -679,25 +768,77 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 	dt_virt = (void *)dt_virt_base + offset;
 
 	/* map the first chunk so we can read the size from the header */
-	create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
-		       SWAPPER_BLOCK_SIZE, prot);
+	create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
+			dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
 
 	if (fdt_magic(dt_virt) != FDT_MAGIC)
 		return NULL;
 
-	size = fdt_totalsize(dt_virt);
-	if (size > MAX_FDT_SIZE)
+	*size = fdt_totalsize(dt_virt);
+	if (*size > MAX_FDT_SIZE)
 		return NULL;
 
-	if (offset + size > SWAPPER_BLOCK_SIZE)
-		create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
-			       round_up(offset + size, SWAPPER_BLOCK_SIZE), prot);
+	if (offset + *size > SWAPPER_BLOCK_SIZE)
+		create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
+			       round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
 
-	memblock_reserve(dt_phys, size);
+	return dt_virt;
+}
+
+void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
+{
+	void *dt_virt;
+	int size;
+
+	dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
+	if (!dt_virt)
+		return NULL;
 
+	memblock_reserve(dt_phys, size);
 	return dt_virt;
 }
 
+int __init arch_ioremap_pud_supported(void)
+{
+	/* only 4k granule supports level 1 block mappings */
+	return IS_ENABLED(CONFIG_ARM64_4K_PAGES);
+}
+
+int __init arch_ioremap_pmd_supported(void)
+{
+	return 1;
+}
+
+int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot)
+{
+	BUG_ON(phys & ~PUD_MASK);
+	set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
+	return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
+{
+	BUG_ON(phys & ~PMD_MASK);
+	set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
+	return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+	if (!pud_sect(*pud))
+		return 0;
+	pud_clear(pud);
+	return 1;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+	if (!pmd_sect(*pmd))
+		return 0;
+	pmd_clear(pmd);
+	return 1;
+}
+
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 int pud_free_pmd_page(pud_t *pud)
 {
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index cf6240741134..f4e39dbdd36b 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -36,14 +37,32 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
 	return 0;
 }
 
+/*
+ * This function assumes that the range is mapped with PAGE_SIZE pages.
+ */
+static int __change_memory_common(unsigned long start, unsigned long size,
+				pgprot_t set_mask, pgprot_t clear_mask)
+{
+	struct page_change_data data;
+	int ret;
+
+	data.set_mask = set_mask;
+	data.clear_mask = clear_mask;
+
+	ret = apply_to_page_range(&init_mm, start, size, change_page_range,
+					&data);
+
+	flush_tlb_kernel_range(start, start + size);
+	return ret;
+}
+
 static int change_memory_common(unsigned long addr, int numpages,
 				pgprot_t set_mask, pgprot_t clear_mask)
 {
 	unsigned long start = addr;
 	unsigned long size = PAGE_SIZE*numpages;
 	unsigned long end = start + size;
-	int ret;
-	struct page_change_data data;
+	struct vm_struct *area;
 
 	if (!PAGE_ALIGNED(addr)) {
 		start &= PAGE_MASK;
@@ -51,23 +70,29 @@ static int change_memory_common(unsigned long addr, int numpages,
 		WARN_ON_ONCE(1);
 	}
 
-	if (start < MODULES_VADDR || start >= MODULES_END)
-		return -EINVAL;
-
-	if (end < MODULES_VADDR || end >= MODULES_END)
+	/*
+	 * Kernel VA mappings are always live, and splitting live section
+	 * mappings into page mappings may cause TLB conflicts. This means
+	 * we have to ensure that changing the permission bits of the range
+	 * we are operating on does not result in such splitting.
+	 *
+	 * Let's restrict ourselves to mappings created by vmalloc (or vmap).
+	 * Those are guaranteed to consist entirely of page mappings, and
+	 * splitting is never needed.
+	 *
+	 * So check whether the [addr, addr + size) interval is entirely
+	 * covered by precisely one VM area that has the VM_ALLOC flag set.
+	 */
+	area = find_vm_area((void *)addr);
+	if (!area ||
+	    end > (unsigned long)area->addr + area->size ||
+	    !(area->flags & VM_ALLOC))
 		return -EINVAL;
 
 	if (!numpages)
 		return 0;
 
-	data.set_mask = set_mask;
-	data.clear_mask = clear_mask;
-
-	ret = apply_to_page_range(&init_mm, start, size, change_page_range,
-					&data);
-
-	flush_tlb_kernel_range(start, end);
-	return ret;
+	return __change_memory_common(start, size, set_mask, clear_mask);
 }
 
 int set_memory_ro(unsigned long addr, int numpages)
@@ -99,3 +124,32 @@ int set_memory_x(unsigned long addr, int numpages)
 					__pgprot(PTE_PXN));
 }
 EXPORT_SYMBOL_GPL(set_memory_x);
+
+
+int set_memory_valid(unsigned long addr, int numpages, int enable)
+{
+	if (enable)
+		return __change_memory_common(addr, PAGE_SIZE * numpages,
+					__pgprot(PTE_VALID),
+					__pgprot(0));
+	else
+		return __change_memory_common(addr, PAGE_SIZE * numpages,
+					__pgprot(0),
+					__pgprot(PTE_VALID));
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	unsigned long addr = (unsigned long) page_address(page);
+
+	if (enable)
+		__change_memory_common(addr, PAGE_SIZE * numpages,
+					__pgprot(PTE_VALID),
+					__pgprot(0));
+	else
+		__change_memory_common(addr, PAGE_SIZE * numpages,
+					__pgprot(0),
+					__pgprot(PTE_VALID));
+}
+#endif
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index cb3ba1b812e7..ae11d4e03d0e 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 
-static int __init pgd_cache_init(void)
+void __init pgd_cache_init(void)
 {
+	if (PGD_SIZE == PAGE_SIZE)
+		return;
+
 	/*
 	 * Naturally aligned pgds required by the architecture.
 	 */
-	if (PGD_SIZE != PAGE_SIZE)
-		pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE,
-					      SLAB_PANIC, NULL);
-	return 0;
+	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE,
+				      SLAB_PANIC, NULL);
 }
-core_initcall(pgd_cache_init);
diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S
index d69dffffaa89..984edcda1850 100644
--- a/arch/arm64/mm/proc-macros.S
+++ b/arch/arm64/mm/proc-macros.S
@@ -74,3 +74,25 @@
 	msr	pmuserenr_el0, xzr		// Disable PMU access from EL0
 9000:
 	.endm
+
+/*
+ * Macro to perform data cache maintenance for the interval
+ * [kaddr, kaddr + size)
+ *
+ * 	op:		operation passed to dc instruction
+ * 	domain:		domain used in dsb instruction
+ * 	kaddr:		starting virtual address of the region
+ * 	size:		size of the region
+ * 	Corrupts: 	kaddr, size, tmp1, tmp2
+ */
+	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+	dcache_line_size \tmp1, \tmp2
+	add	\size, \kaddr, \size
+	sub	\tmp2, \tmp1, #1
+	bic	\kaddr, \kaddr, \tmp2
+9998:	dc	\op, \kaddr
+	add	\kaddr, \kaddr, \tmp1
+	cmp	\kaddr, \size
+	b.lo	9998b
+	dsb	\domain
+	.endm
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index f09636738007..178af2dd15c4 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -23,13 +23,11 @@
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/hwcap.h>
-#include <asm/pgtable-hwdef.h>
 #include <asm/pgtable.h>
+#include <asm/pgtable-hwdef.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
 
-#include "proc-macros.S"
-
 #ifdef CONFIG_ARM64_64K_PAGES
 #define TCR_TG_FLAGS	TCR_TG0_64K | TCR_TG1_64K
 #elif defined(CONFIG_ARM64_16K_PAGES)
@@ -66,62 +64,59 @@ ENTRY(cpu_do_suspend)
 	mrs	x2, tpidr_el0
 	mrs	x3, tpidrro_el0
 	mrs	x4, contextidr_el1
-	mrs	x5, mair_el1
-	mrs	x6, cpacr_el1
-	mrs	x7, ttbr1_el1
-	mrs	x8, tcr_el1
-	mrs	x9, vbar_el1
-	mrs	x10, mdscr_el1
-	mrs	x11, oslsr_el1
-	mrs	x12, sctlr_el1
+	mrs	x5, cpacr_el1
+	mrs	x6, tcr_el1
+	mrs	x7, vbar_el1
+	mrs	x8, mdscr_el1
+	mrs	x9, oslsr_el1
+	mrs	x10, sctlr_el1
 	stp	x2, x3, [x0]
-	stp	x4, x5, [x0, #16]
-	stp	x6, x7, [x0, #32]
-	stp	x8, x9, [x0, #48]
-	stp	x10, x11, [x0, #64]
-	str	x12, [x0, #80]
+	stp	x4, xzr, [x0, #16]
+	stp	x5, x6, [x0, #32]
+	stp	x7, x8, [x0, #48]
+	stp	x9, x10, [x0, #64]
 	ret
 ENDPROC(cpu_do_suspend)
 
 /**
  * cpu_do_resume - restore CPU register context
  *
- * x0: Physical address of context pointer
- * x1: ttbr0_el1 to be restored
- *
- * Returns:
- *	sctlr_el1 value in x0
+ * x0: Address of context pointer
  */
 ENTRY(cpu_do_resume)
-	/*
-	 * Invalidate local tlb entries before turning on MMU
-	 */
-	tlbi	vmalle1
 	ldp	x2, x3, [x0]
 	ldp	x4, x5, [x0, #16]
-	ldp	x6, x7, [x0, #32]
-	ldp	x8, x9, [x0, #48]
-	ldp	x10, x11, [x0, #64]
-	ldr	x12, [x0, #80]
+	ldp	x6, x8, [x0, #32]
+	ldp	x9, x10, [x0, #48]
+	ldp	x11, x12, [x0, #64]
 	msr	tpidr_el0, x2
 	msr	tpidrro_el0, x3
 	msr	contextidr_el1, x4
-	msr	mair_el1, x5
 	msr	cpacr_el1, x6
-	msr	ttbr0_el1, x1
-	msr	ttbr1_el1, x7
-	tcr_set_idmap_t0sz x8, x7
+
+	/* Don't change t0sz here, mask those bits when restoring */
+	mrs	x5, tcr_el1
+	bfi	x8, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+
 	msr	tcr_el1, x8
 	msr	vbar_el1, x9
+
+	/*
+	 * __cpu_setup() cleared MDSCR_EL1.MDE and friends, before unmasking
+	 * debug exceptions. By restoring MDSCR_EL1 here, we may take a debug
+	 * exception. Mask them until local_dbg_restore() in cpu_suspend()
+	 * resets them.
+	 */
+	disable_dbg
 	msr	mdscr_el1, x10
+
+	msr	sctlr_el1, x12
 	/*
 	 * Restore oslsr_el1 by writing oslar_el1
 	 */
 	ubfx	x11, x11, #1, #1
 	msr	oslar_el1, x11
 	reset_pmuserenr_el0 x0			// Disable PMU access from EL0
-	mov	x0, x12
-	dsb	nsh		// Make sure local tlb invalidation completed
 	isb
 	ret
 ENDPROC(cpu_do_resume)
@@ -152,7 +147,33 @@ alternative_else
 alternative_endif
 ENDPROC(cpu_do_switch_mm)
 
-	.section ".text.init", #alloc, #execinstr
+	.pushsection ".idmap.text", "ax"
+/*
+ * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd)
+ *
+ * This is the low-level counterpart to cpu_replace_ttbr1, and should not be
+ * called by anything else. It can only be executed from a TTBR0 mapping.
+ */
+ENTRY(idmap_cpu_replace_ttbr1)
+	mrs	x2, daif
+	msr	daifset, #0xf
+
+	adrp	x1, empty_zero_page
+	msr	ttbr1_el1, x1
+	isb
+
+	tlbi	vmalle1
+	dsb	nsh
+	isb
+
+	msr	ttbr1_el1, x0
+	isb
+
+	msr	daif, x2
+
+	ret
+ENDPROC(idmap_cpu_replace_ttbr1)
+	.popsection
 
 /*
  *	__cpu_setup