diff options
Diffstat (limited to 'arch/arm64/kernel')
31 files changed, 926 insertions, 631 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 96ae961ada72..01c0b3881f88 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -52,7 +52,3 @@ obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) head-y := head.o extra-y += $(head-y) vmlinux.lds - -# vDSO - this must be built first to generate the symbol offsets -$(call objectify,$(arm64-obj-y)): $(obj)/vdso/vdso-offsets.h -$(obj)/vdso/vdso-offsets.h: $(obj)/vdso diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index d2ee1b21a10d..4434dabde898 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -25,14 +25,13 @@ #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> +#include <asm/sections.h> #include <linux/stop_machine.h> #define __ALT_PTR(a,f) (u32 *)((void *)&(a)->f + (a)->f) #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - struct alt_region { struct alt_instr *begin; struct alt_instr *end; @@ -124,8 +123,8 @@ static int __apply_alternatives_multi_stop(void *unused) { static int patched = 0; struct alt_region region = { - .begin = __alt_instructions, - .end = __alt_instructions_end, + .begin = (struct alt_instr *)__alt_instructions, + .end = (struct alt_instr *)__alt_instructions_end, }; /* We always have a CPU 0 at this point (__init) */ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index d03e311b6f3c..10d3642deb7c 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -14,7 +14,6 @@ #include <linux/slab.h> #include <linux/sysctl.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> #include <asm/opcodes.h> @@ -281,9 +280,9 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) * Error-checking SWP macros 
implemented using ldxr{b}/stxr{b} */ #define __user_swpX_asm(data, addr, res, temp, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%3]\n" \ "1: stxr"B" %w0, %w1, [%3]\n" \ " cbz %w0, 2f\n" \ @@ -299,12 +298,12 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ : "r" ((unsigned long)addr), "i" (-EAGAIN), \ "i" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp) \ __user_swpX_asm(data, addr, res, temp, "") diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index a506ddc270c0..67ebe708e30c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -35,11 +35,19 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); + DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); +#else DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +#endif +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0)); +#endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); BLANK(); @@ -89,6 +97,7 @@ int main(void) BLANK(); 
DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); + DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); @@ -96,6 +105,8 @@ int main(void) DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); BLANK(); DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last)); + DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec)); + DEFINE(VDSO_RAW_TIME_NSEC, offsetof(struct vdso_data, raw_time_nsec)); DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec)); DEFINE(VDSO_XTIME_CLK_NSEC, offsetof(struct vdso_data, xtime_clock_nsec)); DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec)); @@ -103,7 +114,8 @@ int main(void) DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec)); DEFINE(VDSO_WTM_CLK_NSEC, offsetof(struct vdso_data, wtm_clock_nsec)); DEFINE(VDSO_TB_SEQ_COUNT, offsetof(struct vdso_data, tb_seq_count)); - DEFINE(VDSO_CS_MULT, offsetof(struct vdso_data, cs_mult)); + DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult)); + DEFINE(VDSO_CS_RAW_MULT, offsetof(struct vdso_data, cs_raw_mult)); DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift)); DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest)); DEFINE(VDSO_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); @@ -119,6 +131,11 @@ int main(void) BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); BLANK(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); + BLANK(); +#endif #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 
476fb5caa361..53fab76d3c39 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -46,6 +46,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); #define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 40270c1fa947..b60da856fc19 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -11,317 +11,34 @@ * */ -#include <linux/atomic.h> #include <linux/dmi.h> #include <linux/efi.h> -#include <linux/export.h> -#include <linux/memblock.h> -#include <linux/mm_types.h> -#include <linux/bootmem.h> -#include <linux/of.h> -#include <linux/of_fdt.h> -#include <linux/preempt.h> -#include <linux/rbtree.h> -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/spinlock.h> +#include <linux/init.h> -#include <asm/cacheflush.h> #include <asm/efi.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/mmu.h> -#include <asm/pgtable.h> -struct efi_memory_map memmap; - -static u64 efi_system_table; - -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - -static struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, - .pgd = efi_pgd, - .mm_users = ATOMIC_INIT(2), - .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), - .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), - .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), -}; - -static int __init is_normal_ram(efi_memory_desc_t *md) -{ - if (md->attribute & EFI_MEMORY_WB) - return 1; - return 0; -} - -/* - * Translate a EFI virtual address into a physical address: this is necessary, - * as some data members of the EFI system table are virtually remapped after - * SetVirtualAddressMap() has been called. 
- */ -static phys_addr_t efi_to_phys(unsigned long addr) +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { - efi_memory_desc_t *md; - - for_each_efi_memory_desc(&memmap, md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - /* no virtual mapping has been installed by the stub */ - break; - if (md->virt_addr <= addr && - (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT)) - return md->phys_addr + addr - md->virt_addr; - } - return addr; -} - -static int __init uefi_init(void) -{ - efi_char16_t *c16; - void *config_tables; - u64 table_size; - char vendor[100] = "unknown"; - int i, retval; - - efi.systab = early_memremap(efi_system_table, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) { - pr_warn("Unable to map EFI system table.\n"); - return -ENOMEM; - } - - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); + pteval_t prot_val; /* - * Verify the EFI Table + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. 
*/ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { - pr_err("System table signature incorrect\n"); - retval = -EINVAL; - goto out; - } - if ((efi.systab->hdr.revision >> 16) < 2) - pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* Show what we know for posterity */ - c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor), - sizeof(vendor) * sizeof(efi_char16_t)); - if (c16) { - for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = c16[i]; - vendor[i] = '\0'; - early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t)); - } - - pr_info("EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables; - config_tables = early_memremap(efi_to_phys(efi.systab->tables), - table_size); - if (config_tables == NULL) { - pr_warn("Unable to map EFI config table array.\n"); - retval = -ENOMEM; - goto out; - } - retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); - - early_memunmap(config_tables, table_size); -out: - early_memunmap(efi.systab, sizeof(efi_system_table_t)); - return retval; -} - -/* - * Return true for RAM regions we want to permanently reserve. 
- */ -static __init int is_reserve_region(efi_memory_desc_t *md) -{ - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - case EFI_PERSISTENT_MEMORY: - return 0; - default: - break; - } - return is_normal_ram(md); -} - -static __init void reserve_regions(void) -{ - efi_memory_desc_t *md; - u64 paddr, npages, size; - - if (efi_enabled(EFI_DBG)) - pr_info("Processing EFI memory map:\n"); - - for_each_efi_memory_desc(&memmap, md) { - paddr = md->phys_addr; - npages = md->num_pages; - - if (efi_enabled(EFI_DBG)) { - char buf[64]; - - pr_info(" 0x%012llx-0x%012llx %s", - paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - efi_md_typeattr_format(buf, sizeof(buf), md)); - } - - memrange_efi_to_native(&paddr, &npages); - size = npages << PAGE_SHIFT; - - if (is_normal_ram(md)) - early_init_dt_add_memory_arch(paddr, size); - - if (is_reserve_region(md)) { - memblock_reserve(paddr, size); - if (efi_enabled(EFI_DBG)) - pr_cont("*"); - } - - if (efi_enabled(EFI_DBG)) - pr_cont("\n"); - } - - set_bit(EFI_MEMMAP, &efi.flags); -} - -void __init efi_init(void) -{ - struct efi_fdt_params params; - - /* Grab UEFI information placed in FDT by stub */ - if (!efi_get_fdt_params(&params)) - return; - - efi_system_table = params.system_table; - - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); - memmap.phys_map = params.mmap; - memmap.map = early_memremap(params.mmap, params.mmap_size); - if (memmap.map == NULL) { - /* - * If we are booting via UEFI, the UEFI memory map is the only - * description of memory we have, so there is little point in - * proceeding if we cannot access it.
- */ - panic("Unable to map EFI memory map.\n"); - } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; - - if (uefi_init() < 0) - return; - - reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); -} - -static bool __init efi_virtmap_init(void) -{ - efi_memory_desc_t *md; - - init_new_context(NULL, &efi_mm); - - for_each_efi_memory_desc(&memmap, md) { - pgprot_t prot; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - if (md->virt_addr == 0) - return false; - - pr_info(" EFI remap 0x%016llx => %p\n", - md->phys_addr, (void *)md->virt_addr); - - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if (!is_normal_ram(md)) - prot = __pgprot(PROT_DEVICE_nGnRE); - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot = PAGE_KERNEL_EXEC; - else - prot = PAGE_KERNEL; - - create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, - md->num_pages << EFI_PAGE_SHIFT, - __pgprot(pgprot_val(prot) | PTE_NG), true); - } - return true; -} - -/* - * Enable the UEFI Runtime Services if all prerequisites are in place, i.e., - * non-early mapping of the UEFI system table and virtual mappings for all - * EFI_MEMORY_RUNTIME regions. 
- */ -static int __init arm64_enable_runtime_services(void) -{ - u64 mapsize; - - if (!efi_enabled(EFI_BOOT)) { - pr_info("EFI services will not be available.\n"); - return 0; - } - - if (efi_runtime_disabled()) { - pr_info("EFI runtime services will be disabled.\n"); - return 0; - } - - pr_info("Remapping and enabling EFI services.\n"); - - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; - - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); - return -ENOMEM; - } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - - if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); - return -ENOMEM; - } - - /* Set up runtime services function pointers */ - efi_native_runtime_setup(); - set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - - efi.runtime_version = efi.systab->hdr.revision; - + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot_val = PROT_DEVICE_nGnRE; + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot_val = pgprot_val(PAGE_KERNEL_EXEC); + else + prot_val = pgprot_val(PAGE_KERNEL); + + create_pgd_mapping(mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(prot_val | PTE_NG), true); return 0; } -early_initcall(arm64_enable_runtime_services); static int __init arm64_dmi_init(void) { @@ -337,23 +54,6 @@ static int __init arm64_dmi_init(void) } core_initcall(arm64_dmi_init); -static void efi_set_pgd(struct mm_struct *mm) -{ - switch_mm(NULL, mm, NULL); -} - -void efi_virtmap_load(void) -{ - preempt_disable(); - efi_set_pgd(&efi_mm); -} - -void efi_virtmap_unload(void) -{ - efi_set_pgd(current->active_mm); - preempt_enable(); -} - 
/* * UpdateCapsule() depends on the system being shutdown via * ResetSystem(). diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6f69650c8b7b..b5119f7f5cd6 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include <asm/esr.h> #include <asm/irq.h> #include <asm/memory.h> +#include <asm/ptrace.h> #include <asm/thread_info.h> +#include <asm/uaccess.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> @@ -91,9 +93,14 @@ .if \el == 0 mrs x21, sp_el0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug +#else mov tsk, sp and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug +#endif disable_step_tsk x19, x20 // exceptions when scheduling. mov x29, xzr // fp pointed to user-space @@ -101,14 +108,48 @@ add x21, sp, #S_FRAME_SIZE get_thread_info tsk /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else ldr x20, [tsk, #TI_ADDR_LIMIT] +#endif str x20, [sp, #S_ORIG_ADDR_LIMIT] mov x20, #TASK_SIZE_64 +#ifdef CONFIG_THREAD_INFO_IN_TASK + str x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else str x20, [tsk, #TI_ADDR_LIMIT] +#endif .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. 
+ */ +alternative_if ARM64_HAS_PAN + b 1f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + __uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -139,12 +180,50 @@ .if \el != 0 /* Restore the task's original addr_limit. */ ldr x20, [sp, #S_ORIG_ADDR_LIMIT] +#ifdef CONFIG_THREAD_INFO_IN_TASK + str x20, [tsk, #TSK_TI_ADDR_LIMIT] +#else str x20, [tsk, #TI_ADDR_LIMIT] +#endif .endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if ARM64_HAS_PAN + b 2f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + .endif + + __uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). 
+ */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -166,6 +245,7 @@ alternative_else alternative_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] @@ -188,23 +268,26 @@ alternative_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp /* - * Compare sp with the current thread_info, if the top - * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and - * should switch to the irq stack. + * Compare sp with the base of the task stack. + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, + * and should switch to the irq stack. */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x25, [tsk, TSK_STACK] + eor x25, x25, x19 + and x25, x25, #~(THREAD_SIZE - 1) + cbnz x25, 9998f +#else and x25, x19, #~(THREAD_SIZE - 1) cmp x25, tsk b.ne 9998f +#endif - this_cpu_ptr irq_stack, x25, x26 + adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 @@ -292,7 +375,7 @@ END(vectors) * Invalid mode handlers */ .macro inv_entry, el, reason, regsize = 64 - kernel_entry el, \regsize + kernel_entry \el, \regsize mov x0, sp mov x1, #\reason mrs x2, esr_el1 @@ -351,6 +434,8 @@ el1_sync: lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1 b.eq el1_da + cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1 + b.eq el1_ia cmp x24, #ESR_ELx_EC_SYS64 // configurable trap b.eq el1_undef cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception @@ -362,6 +447,11 @@ el1_sync: cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1 b.ge el1_dbg b el1_inv + +el1_ia: + /* + * Fall through to the Data abort case + */ el1_da: /* 
* Data abort handling @@ -426,9 +516,17 @@ el1_irq: irq_handler #ifdef CONFIG_PREEMPT +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count +#else ldr w24, [tsk, #TI_PREEMPT] // get preempt count +#endif cbnz w24, 1f // preempt count != 0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags +#else ldr x0, [tsk, #TI_FLAGS] // get flags +#endif tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? bl el1_preempt 1: @@ -443,7 +541,11 @@ ENDPROC(el1_irq) el1_preempt: mov x24, lr 1: bl preempt_schedule_irq // irq en/disable is done inside +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS +#else ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS +#endif tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? ret x24 #endif @@ -548,7 +650,7 @@ el0_ia: enable_dbg_and_irq ct_user_exit mov x0, x26 - orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts + mov x1, x25 mov x2, sp bl do_mem_abort b ret_to_user @@ -663,8 +765,12 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 +#ifdef CONFIG_THREAD_INFO_IN_TASK + msr sp_el0, x1 +#else and x9, x9, #~(THREAD_SIZE - 1) msr sp_el0, x9 +#endif ret ENDPROC(cpu_switch_to) @@ -675,7 +781,11 @@ ENDPROC(cpu_switch_to) ret_fast_syscall: disable_irq // disable interrupts str x0, [sp, #S_X0] // returned x0 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing +#else ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing +#endif and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace and x2, x1, #_TIF_WORK_MASK @@ -707,7 +817,11 @@ work_resched: */ ret_to_user: disable_irq // disable interrupts +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x1, [tsk, #TSK_TI_FLAGS] +#else ldr x1, [tsk, #TI_FLAGS] +#endif and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 @@ -739,7 +853,11 @@ el0_svc_naked: // compat entry point enable_dbg_and_irq 
ct_user_exit 1 +#ifdef CONFIG_THREAD_INFO_IN_TASK + ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks +#else ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks +#endif tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 23cf8e9a295f..3e99814b6463 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -319,14 +319,14 @@ __create_page_tables: * dirty cache lines being evicted. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables. */ mov x0, x25 - add x6, x26, #SWAPPER_DIR_SIZE + add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -405,7 +405,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ mov x0, x25 - add x1, x26, #SWAPPER_DIR_SIZE + add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range @@ -419,6 +419,7 @@ ENDPROC(__create_page_tables) .set initial_sp, init_thread_union + THREAD_START_SP __primary_switched: mov x28, lr // preserve LR + adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -431,10 +432,18 @@ __primary_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW +#ifdef CONFIG_THREAD_INFO_IN_TASK + adrp x4, init_thread_union + add sp, x4, #THREAD_SIZE + adr_l x5, init_task + msr sp_el0, x5 // Save thread_info +#else adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info +#endif + str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between @@ -464,7 +473,7 @@ ENDPROC(__primary_switched) * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region */ - 
.section ".text","ax" + .section ".idmap.text","ax" ENTRY(kimage_vaddr) .quad _text - TEXT_OFFSET @@ -587,17 +596,23 @@ set_cpu_boot_mode_flag: ENDPROC(set_cpu_boot_mode_flag) /* + * These values are written with the MMU off, but read with the MMU on. + * Writers will invalidate the corresponding address, discarding up to a + * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures + * sufficient alignment that the CWG doesn't overlap another section. + */ + .pushsection ".mmuoff.data.write", "aw" +/* * We need to find out the CPU boot mode long after boot, so we need to * store it in a writable variable. * * This is not in .bss, because we set it sufficiently early that the boot-time * zeroing of .bss would clobber it. */ - .pushsection .data..cacheline_aligned - .align L1_CACHE_SHIFT ENTRY(__boot_cpu_mode) .long BOOT_CPU_MODE_EL2 .long BOOT_CPU_MODE_EL1 + .popsection /* @@ -644,12 +659,19 @@ __secondary_switched: adr_l x5, vectors msr vbar_el1, x5 isb - +#ifdef CONFIG_THREAD_INFO_IN_TASK + adr_l x0, secondary_data + ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack + mov sp, x1 + ldr x2, [x0, #CPU_BOOT_TASK] + msr sp_el0, x2 +#else adr_l x0, secondary_data ldr x0, [x0, #CPU_BOOT_STACK] // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save thread_info +#endif mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) @@ -688,7 +710,6 @@ ENTRY(__early_cpu_boot_status) * Checks if the selected granule size is supported by the CPU. * If it isn't, park the CPU */ - .section ".idmap.text", "ax" ENTRY(__enable_mmu) mrs x18, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 35a33d705536..4284ced551fe 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -54,12 +54,6 @@ extern int in_suspend; /* Do we need to reset el2? 
*/ #define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) -/* - * Start/end of the hibernate exit code, this must be copied to a 'safe' - * location in memory, and executed from there. - */ -extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; - /* temporary el2 vectors in the __hibernate_exit_text section. */ extern char hibernate_el2_vectors[]; @@ -242,6 +236,7 @@ out: return rc; } +#define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start)) int swsusp_arch_suspend(void) { @@ -257,8 +252,13 @@ int swsusp_arch_suspend(void) ret = swsusp_save(); } else { - /* Clean kernel to PoC for secondary core startup */ - __flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START); + /* Clean kernel core startup/idle code to PoC*/ + dcache_clean_range(__mmuoff_data_start, __mmuoff_data_end); + dcache_clean_range(__idmap_text_start, __idmap_text_end); + + /* Clean kvm setup code to PoC? */ + if (el2_reset_needed()) + dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); /* make the crash dump kernel image protected again */ crash_post_resume(); @@ -277,6 +277,33 @@ int swsusp_arch_suspend(void) return ret; } +static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) +{ + pte_t pte = *src_pte; + + if (pte_valid(pte)) { + /* + * Resume will overwrite areas that may be marked + * read only (code, rodata). Clear the RDONLY bit from + * the temporary mappings we use during restore. + */ + set_pte(dst_pte, pte_clear_rdonly(pte)); + } else if (debug_pagealloc_enabled() && !pte_none(pte)) { + /* + * debug_pagealloc will removed the PTE_VALID bit if + * the page isn't in use by the resume kernel. It may have + * been in use by the original kernel, in which case we need + * to put it back in our copy to do the restore. + * + * Before marking this entry valid, check the pfn should + * be mapped. 
+ */ + BUG_ON(!pfn_valid(pte_pfn(pte))); + + set_pte(dst_pte, pte_mkpresent(pte_clear_rdonly(pte))); + } +} + static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start, unsigned long end) { @@ -292,13 +319,7 @@ static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start, src_pte = pte_offset_kernel(src_pmd, start); do { - if (!pte_none(*src_pte)) - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY)); + _copy_pte(dst_pte, src_pte, addr); } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); return 0; diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 22543cb1a8e1..1c694f3c643c 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -318,9 +318,21 @@ static int get_hbp_len(u8 hbp_len) case ARM_BREAKPOINT_LEN_2: len_in_bytes = 2; break; + case ARM_BREAKPOINT_LEN_3: + len_in_bytes = 3; + break; case ARM_BREAKPOINT_LEN_4: len_in_bytes = 4; break; + case ARM_BREAKPOINT_LEN_5: + len_in_bytes = 5; + break; + case ARM_BREAKPOINT_LEN_6: + len_in_bytes = 6; + break; + case ARM_BREAKPOINT_LEN_7: + len_in_bytes = 7; + break; case ARM_BREAKPOINT_LEN_8: len_in_bytes = 8; break; @@ -350,7 +362,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) * to generic breakpoint descriptions. 
*/ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type) + int *gen_len, int *gen_type, int *offset) { /* Type */ switch (ctrl.type) { @@ -370,17 +382,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } + if (!ctrl.len) + return -EINVAL; + *offset = __ffs(ctrl.len); + /* Len */ - switch (ctrl.len) { + switch (ctrl.len >> *offset) { case ARM_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; case ARM_BREAKPOINT_LEN_2: *gen_len = HW_BREAKPOINT_LEN_2; break; + case ARM_BREAKPOINT_LEN_3: + *gen_len = HW_BREAKPOINT_LEN_3; + break; case ARM_BREAKPOINT_LEN_4: *gen_len = HW_BREAKPOINT_LEN_4; break; + case ARM_BREAKPOINT_LEN_5: + *gen_len = HW_BREAKPOINT_LEN_5; + break; + case ARM_BREAKPOINT_LEN_6: + *gen_len = HW_BREAKPOINT_LEN_6; + break; + case ARM_BREAKPOINT_LEN_7: + *gen_len = HW_BREAKPOINT_LEN_7; + break; case ARM_BREAKPOINT_LEN_8: *gen_len = HW_BREAKPOINT_LEN_8; break; @@ -424,9 +452,21 @@ static int arch_build_bp_info(struct perf_event *bp) case HW_BREAKPOINT_LEN_2: info->ctrl.len = ARM_BREAKPOINT_LEN_2; break; + case HW_BREAKPOINT_LEN_3: + info->ctrl.len = ARM_BREAKPOINT_LEN_3; + break; case HW_BREAKPOINT_LEN_4: info->ctrl.len = ARM_BREAKPOINT_LEN_4; break; + case HW_BREAKPOINT_LEN_5: + info->ctrl.len = ARM_BREAKPOINT_LEN_5; + break; + case HW_BREAKPOINT_LEN_6: + info->ctrl.len = ARM_BREAKPOINT_LEN_6; + break; + case HW_BREAKPOINT_LEN_7: + info->ctrl.len = ARM_BREAKPOINT_LEN_7; + break; case HW_BREAKPOINT_LEN_8: info->ctrl.len = ARM_BREAKPOINT_LEN_8; break; @@ -518,18 +558,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) default: return -EINVAL; } - - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; } else { if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - if (info->address & alignment_mask) - return -EINVAL; + offset = info->address & alignment_mask; } + info->address &= ~alignment_mask; + info->ctrl.len 
<<= offset; + /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. @@ -662,12 +701,47 @@ unlock: } NOKPROBE_SYMBOL(breakpoint_handler); +/* + * Arm64 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction accesses both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelihood that a given access triggered the watchpoint. + * + * See Section D2.10.5 "Determining the memory location that caused a Watchpoint + * exception" of ARMv8 Architecture Reference Manual for details. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0. + */ +static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u64 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - int i, step = 0, *kernel_step, access; + int i, step = 0, *kernel_step, access, closest_match = 0; + u64 min_dist = -1, dist; u32 ctrl_reg; - u64 val, alignment_mask; + u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; @@ -676,35 +750,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, slots = this_cpu_ptr(wp_on_reg); debug_info = &current->thread.debug; + /* + * Find all watchpoints that match the reported address. 
If no exact + * match is found. Attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; - - info = counter_arch_bp(wp); - /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ - if (is_compat_task()) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - } else { - alignment_mask = 0x7; - } - - /* Check if the watchpoint value matches. */ - val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (untagged_addr(addr) & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select to confirm. */ - ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); - decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) - goto unlock; + continue; /* * Check that the access type matches. @@ -713,18 +767,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; + /* Check if the watchpoint value and byte select match. */ + val = read_wb_reg(AARCH64_DBG_REG_WVR, i); + ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + + info = counter_arch_bp(wp); info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (!wp->overflow_handler) step = 1; + } + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + perf_bp_event(wp, regs); -unlock: - rcu_read_unlock(); + /* Do we need to handle the stepping? 
*/ + if (!wp->overflow_handler) + step = 1; } + rcu_read_unlock(); if (!step) return 0; diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index 354be2a872ae..79b17384effa 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -25,8 +25,7 @@ */ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) { - while (count && (!IS_ALIGNED((unsigned long)from, 8) || - !IS_ALIGNED((unsigned long)to, 8))) { + while (count && !IS_ALIGNED((unsigned long)from, 8)) { *(u8 *)to = __raw_readb(from); from++; to++; @@ -54,23 +53,22 @@ EXPORT_SYMBOL(__memcpy_fromio); */ void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) { - while (count && (!IS_ALIGNED((unsigned long)to, 8) || - !IS_ALIGNED((unsigned long)from, 8))) { - __raw_writeb(*(volatile u8 *)from, to); + while (count && !IS_ALIGNED((unsigned long)to, 8)) { + __raw_writeb(*(u8 *)from, to); from++; to++; count--; } while (count >= 8) { - __raw_writeq(*(volatile u64 *)from, to); + __raw_writeq(*(u64 *)from, to); from += 8; to += 8; count -= 8; } while (count) { - __raw_writeb(*(volatile u8 *)from, to); + __raw_writeb(*(u8 *)from, to); from++; to++; count--; diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index 1ce90d8450ae..d05dbe658409 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014-2017 Linaro Ltd. 
<ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,35 +26,21 @@ struct plt_entry { __le32 br; /* br x16 */ }; -u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, +static bool in_init(const struct module *mod, void *loc) +{ + return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size; +} + +u64 module_emit_plt_entry(struct module *mod, void *loc, const Elf64_Rela *rela, Elf64_Sym *sym) { - struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr; - int i = mod->arch.plt_num_entries; + struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : + &mod->arch.init; + struct plt_entry *plt = (struct plt_entry *)pltsec->plt->sh_addr; + int i = pltsec->plt_num_entries; u64 val = sym->st_value + rela->r_addend; /* - * We only emit PLT entries against undefined (SHN_UNDEF) symbols, - * which are listed in the ELF symtab section, but without a type - * or a size. - * So, similar to how the module loader uses the Elf64_Sym::st_value - * field to store the resolved addresses of undefined symbols, let's - * borrow the Elf64_Sym::st_size field (whose value is never used by - * the module loader, even for symbols that are defined) to record - * the address of a symbol's associated PLT entry as we emit it for a - * zero addend relocation (which is the only kind we have to deal with - * in practice). This allows us to find duplicates without having to - * go through the table every time. 
- */ - if (rela->r_addend == 0 && sym->st_size != 0) { - BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]); - return sym->st_size; - } - - mod->arch.plt_num_entries++; - BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries); - - /* * MOVK/MOVN/MOVZ opcode: * +--------+------------+--------+-----------+-------------+---------+ * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | @@ -72,8 +58,19 @@ u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, cpu_to_le32(0xd61f0200) }; - if (rela->r_addend == 0) - sym->st_size = (u64)&plt[i]; + /* + * Check if the entry we just created is a duplicate. Given that the + * relocations are sorted, this will be the last entry we allocated. + * (if one exists). + */ + if (i > 0 && + plt[i].mov0 == plt[i - 1].mov0 && + plt[i].mov1 == plt[i - 1].mov1 && + plt[i].mov2 == plt[i - 1].mov2) + return (u64)&plt[i - 1]; + + pltsec->plt_num_entries++; + BUG_ON(pltsec->plt_num_entries > pltsec->plt_max_entries); return (u64)&plt[i]; } @@ -104,7 +101,8 @@ static bool duplicate_rel(const Elf64_Rela *rela, int num) return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0; } -static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) +static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, + Elf64_Word dstidx) { unsigned int ret = 0; Elf64_Sym *s; @@ -116,13 +114,17 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) case R_AARCH64_CALL26: /* * We only have to consider branch targets that resolve - * to undefined symbols. This is not simply a heuristic, - * it is a fundamental limitation, since the PLT itself - * is part of the module, and needs to be within 128 MB - * as well, so modules can never grow beyond that limit. + * to symbols that are defined in a different section. 
+ * This is not simply a heuristic, it is a fundamental + * limitation, since there is no guaranteed way to emit + * PLT entries sufficiently close to the branch if the + * section size exceeds the range of a branch + * instruction. So ignore relocations against defined + * symbols if they live in the same section as the + * relocation target. */ s = syms + ELF64_R_SYM(rela[i].r_info); - if (s->st_shndx != SHN_UNDEF) + if (s->st_shndx == dstidx) break; /* @@ -149,7 +151,8 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { - unsigned long plt_max_entries = 0; + unsigned long core_plts = 0; + unsigned long init_plts = 0; Elf64_Sym *syms = NULL; int i; @@ -158,14 +161,16 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, * entries. Record the symtab address as well. */ for (i = 0; i < ehdr->e_shnum; i++) { - if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0) - mod->arch.plt = sechdrs + i; + if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt")) + mod->arch.core.plt = sechdrs + i; + else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt")) + mod->arch.init.plt = sechdrs + i; else if (sechdrs[i].sh_type == SHT_SYMTAB) syms = (Elf64_Sym *)sechdrs[i].sh_addr; } - if (!mod->arch.plt) { - pr_err("%s: module PLT section missing\n", mod->name); + if (!mod->arch.core.plt || !mod->arch.init.plt) { + pr_err("%s: module PLT section(s) missing\n", mod->name); return -ENOEXEC; } if (!syms) { @@ -188,14 +193,27 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, /* sort by type, symbol index and addend */ sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); - plt_max_entries += count_plts(syms, rels, numrels); + if (strncmp(secstrings + dstsec->sh_name, ".init", 5) != 0) + core_plts += count_plts(syms, rels, numrels, + sechdrs[i].sh_info); + else + init_plts += count_plts(syms, rels, numrels, + 
sechdrs[i].sh_info); } - mod->arch.plt->sh_type = SHT_NOBITS; - mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry); - mod->arch.plt_num_entries = 0; - mod->arch.plt_max_entries = plt_max_entries; + mod->arch.core.plt->sh_type = SHT_NOBITS; + mod->arch.core.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.core.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.core.plt->sh_size = (core_plts + 1) * sizeof(struct plt_entry); + mod->arch.core.plt_num_entries = 0; + mod->arch.core.plt_max_entries = core_plts; + + mod->arch.init.plt->sh_type = SHT_NOBITS; + mod->arch.init.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.init.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.init.plt->sh_size = (init_plts + 1) * sizeof(struct plt_entry); + mod->arch.init.plt_num_entries = 0; + mod->arch.init.plt_max_entries = init_plts; + return 0; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 7f316982ce00..c9a2ab446dc6 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -380,7 +380,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && ovf == -ERANGE) { - val = module_emit_plt_entry(me, &rel[i], sym); + val = module_emit_plt_entry(me, loc, &rel[i], sym); ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); } diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds index 8949f6c6f729..f7c9781a9d48 100644 --- a/arch/arm64/kernel/module.lds +++ b/arch/arm64/kernel/module.lds @@ -1,3 +1,4 @@ SECTIONS { .plt (NOLOAD) : { BYTE(0) } + .init.plt (NOLOAD) : { BYTE(0) } } diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index f7931d900bca..37e47a9d617e 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -157,10 +157,10 @@ 
arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) mod = __module_address((unsigned long)addr); if (mod && within_module_init((unsigned long)addr, mod) && !within_module_init((unsigned long)scan_end, mod)) - scan_end = (kprobe_opcode_t *)mod->module_init; + scan_end = (kprobe_opcode_t *)mod->init_layout.base; else if (mod && within_module_core((unsigned long)addr, mod) && !within_module_core((unsigned long)scan_end, mod)) - scan_end = (kprobe_opcode_t *)mod->module_core; + scan_end = (kprobe_opcode_t *)mod->core_layout.base; preempt_enable(); } #endif diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 1ee93c7c5a75..17a6449c96a2 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -31,7 +31,7 @@ #include <asm/insn.h> #include <asm/uaccess.h> #include <asm/irq.h> -#include <asm-generic/sections.h> +#include <asm/sections.h> #include "decode-insn.h" @@ -543,8 +543,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) bool arch_within_kprobe_blacklist(unsigned long addr) { - extern char __idmap_text_start[], __idmap_text_end[]; - if ((addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) || (addr >= (unsigned long)__entry_text_start && diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 80624829db61..e34bcf3f2c35 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -45,10 +45,14 @@ #include <linux/personality.h> #include <linux/notifier.h> #include <trace/events/power.h> +#ifdef CONFIG_THREAD_INFO_IN_TASK +#include <linux/percpu.h> +#endif #include <asm/alternative.h> #include <asm/compat.h> #include <asm/cacheflush.h> +#include <asm/exec.h> #include <asm/fpsimd.h> #include <asm/mmu_context.h> #include <asm/processor.h> @@ -165,6 +169,70 @@ void machine_restart(char *cmd) while (1); } +/* + * dump a block of kernel memory from around the given address + 
*/ +static void show_data(unsigned long addr, int nbytes, const char *name) +{ + int i, j; + int nlines; + u32 *p; + + /* + * don't attempt to dump non-kernel addresses or + * values that are probably just small negative numbers + */ + if (addr < PAGE_OFFSET || addr > -256UL) + return; + + printk("\n%s: %#lx:\n", name, addr); + + /* + * round address down to a 32 bit boundary + * and always dump a multiple of 32 bytes + */ + p = (u32 *)(addr & ~(sizeof(u32) - 1)); + nbytes += (addr & (sizeof(u32) - 1)); + nlines = (nbytes + 31) / 32; + + + for (i = 0; i < nlines; i++) { + /* + * just display low 16 bits of address to keep + * each line of the dump < 80 characters + */ + printk("%04lx ", (unsigned long)p & 0xffff); + for (j = 0; j < 8; j++) { + u32 data; + if (probe_kernel_address(p, data)) { + printk(" ********"); + } else { + printk(" %08x", data); + } + ++p; + } + printk("\n"); + } +} + +static void show_extra_register_data(struct pt_regs *regs, int nbytes) +{ + mm_segment_t fs; + unsigned int i; + + fs = get_fs(); + set_fs(KERNEL_DS); + show_data(regs->pc - nbytes, nbytes * 2, "PC"); + show_data(regs->regs[30] - nbytes, nbytes * 2, "LR"); + show_data(regs->sp - nbytes, nbytes * 2, "SP"); + for (i = 0; i < 30; i++) { + char name[4]; + snprintf(name, sizeof(name), "X%u", i); + show_data(regs->regs[i] - nbytes, nbytes * 2, name); + } + set_fs(fs); +} + void __show_regs(struct pt_regs *regs) { int i, top_reg; @@ -191,6 +259,8 @@ void __show_regs(struct pt_regs *regs) if (i % 2 == 0) printk("\n"); } + if (!user_mode(regs)) + show_extra_register_data(regs, 128); printk("\n"); } @@ -313,7 +383,7 @@ static void tls_thread_switch(struct task_struct *next) } /* Restore the UAO state depending on next's addr_limit */ -static void uao_thread_switch(struct task_struct *next) +void uao_thread_switch(struct task_struct *next) { if (IS_ENABLED(CONFIG_ARM64_UAO)) { if (task_thread_info(next)->addr_limit == KERNEL_DS) @@ -323,6 +393,22 @@ static void uao_thread_switch(struct 
task_struct *next) } } +#ifdef CONFIG_THREAD_INFO_IN_TASK +/* + * We store our current task in sp_el0, which is clobbered by userspace. Keep a + * shadow copy so that we can restore this upon entry from userspace. + * + * This is *only* for exception entry from EL0, and is not valid until we + * __switch_to() a user task. + */ +DEFINE_PER_CPU(struct task_struct *, __entry_task); + +static void entry_task_switch(struct task_struct *next) +{ + __this_cpu_write(__entry_task, next); +} +#endif + /* * Thread switching. */ @@ -335,6 +421,9 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); +#ifdef CONFIG_THREAD_INFO_IN_TASK + entry_task_switch(next); +#endif uao_thread_switch(next); /* @@ -352,27 +441,35 @@ struct task_struct *__switch_to(struct task_struct *prev, unsigned long get_wchan(struct task_struct *p) { struct stackframe frame; - unsigned long stack_page; + unsigned long stack_page, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + stack_page = (unsigned long)try_get_task_stack(p); + if (!stack_page) + return 0; + frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif - stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || unwind_frame(p, &frame)) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; + goto out; + if (!in_sched_functions(frame.pc)) { + ret = frame.pc; + goto out; + } } while (count ++ < 16); - return 0; + +out: + put_task_stack(p); + return ret; } unsigned long arch_align_stack(unsigned long sp) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index c5ef05959813..6204b7600d1b 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -327,13 +327,13 @@ static int 
ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, disabled = !ctrl.enabled; + int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; - err = arch_bp_generic_fields(ctrl, &len, &type); + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; @@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, attr->bp_len = len; attr->bp_type = type; + attr->bp_addr += offset; return 0; } @@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type, if (IS_ERR(bp)) return PTR_ERR(bp); - *addr = bp ? bp->attr.bp_addr : 0; + *addr = bp ? counter_arch_bp(bp)->address : 0; return 0; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 1718706fde83..12a87f2600f2 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/ftrace.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> struct return_address_data { diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 566f3b9b1588..f1fae3cbc4ca 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -200,7 +200,7 @@ static void __init request_standard_resources(void) struct resource *res; kernel_code.start = virt_to_phys(_text); - kernel_code.end = virt_to_phys(_etext - 1); + kernel_code.end = virt_to_phys(__init_begin - 1); kernel_data.start = virt_to_phys(_sdata); kernel_data.end = virt_to_phys(_end - 1); @@ -351,6 +351,19 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. 
+ */ +#ifdef CONFIG_THREAD_INFO_IN_TASK + init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#else + init_thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index c2bf5a58039f..9696c5239a3a 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -97,6 +97,7 @@ ENTRY(__cpu_suspend_enter) ENDPROC(__cpu_suspend_enter) .ltorg + .pushsection ".idmap.text", "ax" ENTRY(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly /* enable the MMU early - so we can access sleep_save_stash by va */ @@ -124,9 +125,6 @@ ENTRY(_cpu_resume) /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] mov sp, x2 - /* save thread_info */ - and x2, x2, #~(THREAD_SIZE - 1) - msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context address pointer */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index a1d06fc42048..fb3a21e17bff 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -58,6 +58,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/ipi.h> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -110,6 +113,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * We need to tell the secondary core where to find its stack and the * page tables. 
*/ +#ifdef CONFIG_THREAD_INFO_IN_TASK + secondary_data.task = idle; +#endif secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -134,6 +140,9 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_err("CPU%u: failed to boot: %d\n", cpu, ret); } +#ifdef CONFIG_THREAD_INFO_IN_TASK + secondary_data.task = NULL; +#endif secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); if (ret && status) { @@ -177,7 +186,10 @@ static void smp_store_cpu_info(unsigned int cpuid) asmlinkage void secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -186,8 +198,6 @@ asmlinkage void secondary_start_kernel(void) atomic_inc(&mm->mm_count); current->active_mm = mm; - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. 
@@ -676,6 +686,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (max_cpus == 0) break; + per_cpu(cpu_number, cpu) = cpu; + if (cpu == smp_processor_id()) continue; diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index aef3605a8c47..4b4b0ad6ebde 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -29,7 +29,8 @@ #include <asm/smp_plat.h> extern void secondary_holding_pen(void); -volatile unsigned long secondary_holding_pen_release = INVALID_HWID; +volatile unsigned long __section(".mmuoff.data.read") +secondary_holding_pen_release = INVALID_HWID; static phys_addr_t cpu_release_addr[NR_CPUS]; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index a99eff9afc1f..95b915d17afc 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <asm/irq.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* @@ -128,7 +129,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, break; } } -EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { @@ -160,6 +160,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) struct stack_trace_data data; struct stackframe frame; + if (!try_get_task_stack(tsk)) + return; + data.trace = trace; data.skip = trace->skip; @@ -181,6 +184,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 5a0b1088c17c..468b939f3471 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -5,6 +5,7 @@ #include <asm/cacheflush.h> #include 
<asm/cpufeature.h> #include <asm/debug-monitors.h> +#include <asm/exec.h> #include <asm/pgtable.h> #include <asm/memory.h> #include <asm/mmu_context.h> @@ -44,12 +45,6 @@ void notrace __cpu_suspend_exit(void) cpu_uninstall_idmap(); /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. - */ - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - - /* * Restore HW breakpoint registers to sane values * before debug exceptions are possibly reenabled * through local_dbg_restore. @@ -95,6 +90,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) */ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)); + uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..7758f7ff131b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,10 +19,30 @@ #include <linux/nodemask.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/sched.h> +#include <linux/sched_energy.h> #include <asm/cputype.h> #include <asm/topology.h> +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ +#ifdef CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else + return per_cpu(cpu_scale, cpu); +#endif +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -206,11 +226,72 @@ out: struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = 
sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; } +static int cpu_cpu_flags(void) +{ + return SD_ASYM_CPUCAPACITY; +} + +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, cpu_cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -272,6 +353,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) @@ -302,4 +384,8 @@ void __init init_cpu_topology(void) */ if (of_have_populated_dt() && parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index b62ec1e8a843..6543cd7bcb93 
100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> #include <asm/system_misc.h> @@ -154,6 +155,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!tsk) tsk = current; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + + if (!tsk) + tsk = current; + + if (!try_get_task_stack(tsk)) + return; + /* * Switching between stacks is valid when tracing current and in * non-preemptible context. @@ -219,6 +228,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) stack + sizeof(struct pt_regs), false); } } + + put_task_stack(tsk); } void show_stack(struct task_struct *tsk, unsigned long *sp) @@ -234,10 +245,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) +static int __die(const char *str, int err, struct pt_regs *regs) { - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; static int die_counter; int ret; @@ -252,7 +262,8 @@ static int __die(const char *str, int err, struct thread_info *thread, print_modules(); __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); + TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), + end_of_stack(tsk)); if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->sp, @@ -272,7 +283,6 @@ static DEFINE_RAW_SPINLOCK(die_lock); */ void die(const char *str, struct pt_regs *regs, int err) { - struct thread_info *thread = current_thread_info(); int ret; oops_enter(); @@ -280,9 +290,9 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); - ret = __die(str, err, 
thread, regs); + ret = __die(str, err, regs); - if (regs && kexec_should_crash(thread->task)) + if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); @@ -465,7 +475,7 @@ static const char *esr_class_str[] = { const char *esr_get_class_string(u32 esr) { - return esr_class_str[esr >> ESR_ELx_EC_SHIFT]; + return esr_class_str[ESR_ELx_EC(esr)]; } /* diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 97bc68f4c689..3b8acfae7797 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -55,7 +55,7 @@ struct vdso_data *vdso_data = &vdso_data_store.data; */ static struct page *vectors_page[1]; -static int alloc_vectors_page(void) +static int __init alloc_vectors_page(void) { extern char __kuser_helper_start[], __kuser_helper_end[]; extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; @@ -88,7 +88,7 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr = AARCH32_VECTORS_BASE; - static struct vm_special_mapping spec = { + static const struct vm_special_mapping spec = { .name = "[vectors]", .pages = vectors_page, @@ -212,10 +212,16 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; if (!use_syscall) { + /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; + vdso_data->raw_time_sec = tk->raw_time.tv_sec; + vdso_data->raw_time_nsec = tk->raw_time.tv_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - vdso_data->cs_mult = tk->tkr_mono.mult; + /* tkr_raw.xtime_nsec == 0 */ + vdso_data->cs_mono_mult = tk->tkr_mono.mult; + vdso_data->cs_raw_mult = tk->tkr_raw.mult; + /* tkr_mono.shift == tkr_raw.shift */ vdso_data->cs_shift = tk->tkr_mono.shift; } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index b467fd0a384b..62c84f7cb01b 100644 --- 
a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -23,7 +23,7 @@ GCOV_PROFILE := n ccflags-y += -Wl,-shared obj-y += vdso.o -extra-y += vdso.lds vdso-offsets.h +extra-y += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) # Force dependency (incbin is bad) @@ -42,11 +42,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ && \ - cp $@ include/generated/ + $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ endef -$(obj)/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) # Assembly rules for the .S files diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index efa79e8d4196..e00b4671bd7c 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -26,24 +26,109 @@ #define NSEC_PER_SEC_HI16 0x3b9a vdso_data .req x6 -use_syscall .req w7 -seqcnt .req w8 +seqcnt .req w7 +w_tmp .req w8 +x_tmp .req x8 + +/* + * Conventions for macro arguments: + * - An argument is write-only if its name starts with "res". + * - All other arguments are read-only, unless otherwise specified. 
+ */ .macro seqcnt_acquire 9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT] tbnz seqcnt, #0, 9999b dmb ishld - ldr use_syscall, [vdso_data, #VDSO_USE_SYSCALL] .endm - .macro seqcnt_read, cnt + .macro seqcnt_check fail dmb ishld - ldr \cnt, [vdso_data, #VDSO_TB_SEQ_COUNT] + ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT] + cmp w_tmp, seqcnt + b.ne \fail .endm - .macro seqcnt_check, cnt, fail - cmp \cnt, seqcnt - b.ne \fail + .macro syscall_check fail + ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL] + cbnz w_tmp, \fail + .endm + + .macro get_nsec_per_sec res + mov \res, #NSEC_PER_SEC_LO16 + movk \res, #NSEC_PER_SEC_HI16, lsl #16 + .endm + + /* + * Returns the clock delta, in nanoseconds left-shifted by the clock + * shift. + */ + .macro get_clock_shifted_nsec res, cycle_last, mult + /* Read the virtual counter. */ + isb + mrs x_tmp, cntvct_el0 + /* Calculate cycle delta and convert to ns. */ + sub \res, x_tmp, \cycle_last + /* We can only guarantee 56 bits of precision. */ + movn x_tmp, #0xff00, lsl #48 + and \res, x_tmp, \res + mul \res, \res, \mult + .endm + + /* + * Returns in res_{sec,nsec} the REALTIME timespec, based on the + * "wall time" (xtime) and the clock_mono delta. + */ + .macro get_ts_realtime res_sec, res_nsec, \ + clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec + add \res_nsec, \clock_nsec, \xtime_nsec + udiv x_tmp, \res_nsec, \nsec_to_sec + add \res_sec, \xtime_sec, x_tmp + msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec + .endm + + /* + * Returns in res_{sec,nsec} the timespec based on the clock_raw delta, + * used for CLOCK_MONOTONIC_RAW. + */ + .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec + udiv \res_sec, \clock_nsec, \nsec_to_sec + msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec + .endm + + /* sec and nsec are modified in place. */ + .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec + /* Add timespec. */ + add \sec, \sec, \ts_sec + add \nsec, \nsec, \ts_nsec + + /* Normalise the new timespec. 
*/ + cmp \nsec, \nsec_to_sec + b.lt 9999f + sub \nsec, \nsec, \nsec_to_sec + add \sec, \sec, #1 +9999: + cmp \nsec, #0 + b.ge 9998f + add \nsec, \nsec, \nsec_to_sec + sub \sec, \sec, #1 +9998: + .endm + + .macro clock_gettime_return, shift=0 + .if \shift == 1 + lsr x11, x11, x12 + .endif + stp x10, x11, [x1, #TSPEC_TV_SEC] + mov x0, xzr + ret + .endm + + .macro jump_slot jumptable, index, label + .if (. - \jumptable) != 4 * (\index) + .error "Jump slot index mismatch" + .endif + b \label .endm .text @@ -51,18 +136,25 @@ seqcnt .req w8 /* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */ ENTRY(__kernel_gettimeofday) .cfi_startproc - mov x2, x30 - .cfi_register x30, x2 - - /* Acquire the sequence counter and get the timespec. */ adr vdso_data, _vdso_data -1: seqcnt_acquire - cbnz use_syscall, 4f - /* If tv is NULL, skip to the timezone code. */ cbz x0, 2f - bl __do_get_tspec - seqcnt_check w9, 1b + + /* Compute the time of day. */ +1: seqcnt_acquire + syscall_check fail=4f + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + seqcnt_check fail=1b + + get_nsec_per_sec res=x9 + lsl x9, x9, x12 + + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 /* Convert ns to us. */ mov x13, #1000 @@ -76,95 +168,126 @@ ENTRY(__kernel_gettimeofday) stp w4, w5, [x1, #TZ_MINWEST] 3: mov x0, xzr - ret x2 + ret 4: /* Syscall fallback. 
*/ mov x8, #__NR_gettimeofday svc #0 - ret x2 + ret .cfi_endproc ENDPROC(__kernel_gettimeofday) +#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE + /* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */ ENTRY(__kernel_clock_gettime) .cfi_startproc - cmp w0, #CLOCK_REALTIME - ccmp w0, #CLOCK_MONOTONIC, #0x4, ne - b.ne 2f + cmp w0, #JUMPSLOT_MAX + b.hi syscall + adr vdso_data, _vdso_data + adr x_tmp, jumptable + add x_tmp, x_tmp, w0, uxtw #2 + br x_tmp + + ALIGN +jumptable: + jump_slot jumptable, CLOCK_REALTIME, realtime + jump_slot jumptable, CLOCK_MONOTONIC, monotonic + b syscall + b syscall + jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw + jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse + jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse + + .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1) + .error "Wrong jumptable size" + .endif + + ALIGN +realtime: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + seqcnt_check fail=realtime - mov x2, x30 - .cfi_register x30, x2 + /* All computations are done with left-shifted nsecs. */ + get_nsec_per_sec res=x9 + lsl x9, x9, x12 - /* Get kernel timespec. 
*/ - adr vdso_data, _vdso_data -1: seqcnt_acquire - cbnz use_syscall, 7f + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 + clock_gettime_return, shift=1 - bl __do_get_tspec - seqcnt_check w9, 1b + ALIGN +monotonic: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + /* w11 = cs_mono_mult, w12 = cs_shift */ + ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] + ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] + ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] + seqcnt_check fail=monotonic - mov x30, x2 + /* All computations are done with left-shifted nsecs. */ + lsl x4, x4, x12 + get_nsec_per_sec res=x9 + lsl x9, x9, x12 - cmp w0, #CLOCK_MONOTONIC - b.ne 6f + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_realtime res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 - /* Get wtm timespec. */ - ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] + add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9 + clock_gettime_return, shift=1 - /* Check the sequence counter. */ - seqcnt_read w9 - seqcnt_check w9, 1b - b 4f -2: - cmp w0, #CLOCK_REALTIME_COARSE - ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne - b.ne 8f + ALIGN +monotonic_raw: + seqcnt_acquire + syscall_check fail=syscall + ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] + /* w11 = cs_raw_mult, w12 = cs_shift */ + ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] + ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] + seqcnt_check fail=monotonic_raw - /* xtime_coarse_nsec is already right-shifted */ - mov x12, #0 + /* All computations are done with left-shifted nsecs. */ + lsl x14, x14, x12 + get_nsec_per_sec res=x9 + lsl x9, x9, x12 - /* Get coarse timespec. 
*/ - adr vdso_data, _vdso_data -3: seqcnt_acquire + get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + get_ts_clock_raw res_sec=x10, res_nsec=x11, \ + clock_nsec=x15, nsec_to_sec=x9 + + add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 + clock_gettime_return, shift=1 + + ALIGN +realtime_coarse: + seqcnt_acquire ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] + seqcnt_check fail=realtime_coarse + clock_gettime_return - /* Get wtm timespec. */ + ALIGN +monotonic_coarse: + seqcnt_acquire + ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC] ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC] + seqcnt_check fail=monotonic_coarse - /* Check the sequence counter. */ - seqcnt_read w9 - seqcnt_check w9, 3b + /* Computations are done in (non-shifted) nsecs. */ + get_nsec_per_sec res=x9 + add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9 + clock_gettime_return - cmp w0, #CLOCK_MONOTONIC_COARSE - b.ne 6f -4: - /* Add on wtm timespec. */ - add x10, x10, x13 - lsl x14, x14, x12 - add x11, x11, x14 - - /* Normalise the new timespec. */ - mov x15, #NSEC_PER_SEC_LO16 - movk x15, #NSEC_PER_SEC_HI16, lsl #16 - lsl x15, x15, x12 - cmp x11, x15 - b.lt 5f - sub x11, x11, x15 - add x10, x10, #1 -5: - cmp x11, #0 - b.ge 6f - add x11, x11, x15 - sub x10, x10, #1 - -6: /* Store to the user timespec. */ - lsr x11, x11, x12 - stp x10, x11, [x1, #TSPEC_TV_SEC] - mov x0, xzr - ret -7: - mov x30, x2 -8: /* Syscall fallback. */ + ALIGN +syscall: /* Syscall fallback. */ mov x8, #__NR_clock_gettime svc #0 ret @@ -176,6 +299,7 @@ ENTRY(__kernel_clock_getres) .cfi_startproc cmp w0, #CLOCK_REALTIME ccmp w0, #CLOCK_MONOTONIC, #0x4, ne + ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne b.ne 1f ldr x2, 5f @@ -203,46 +327,3 @@ ENTRY(__kernel_clock_getres) .quad CLOCK_COARSE_RES .cfi_endproc ENDPROC(__kernel_clock_getres) - -/* - * Read the current time from the architected counter. - * Expects vdso_data to be initialised. - * Clobbers the temporary registers (x9 - x15). 
- * Returns: - * - w9 = vDSO sequence counter - * - (x10, x11) = (ts->tv_sec, shifted ts->tv_nsec) - * - w12 = cs_shift - */ -ENTRY(__do_get_tspec) - .cfi_startproc - - /* Read from the vDSO data page. */ - ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST] - ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - ldp w11, w12, [vdso_data, #VDSO_CS_MULT] - seqcnt_read w9 - - /* Read the virtual counter. */ - isb - mrs x15, cntvct_el0 - - /* Calculate cycle delta and convert to ns. */ - sub x10, x15, x10 - /* We can only guarantee 56 bits of precision. */ - movn x15, #0xff00, lsl #48 - and x10, x15, x10 - mul x10, x10, x11 - - /* Use the kernel time to calculate the new timespec. */ - mov x11, #NSEC_PER_SEC_LO16 - movk x11, #NSEC_PER_SEC_HI16, lsl #16 - lsl x11, x11, x12 - add x15, x10, x14 - udiv x14, x15, x11 - add x10, x13, x14 - mul x13, x14, x11 - sub x11, x15, x13 - - ret - .cfi_endproc -ENDPROC(__do_get_tspec) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 623532f44323..f2d34d4c0a6b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -133,12 +133,13 @@ SECTIONS } . = ALIGN(SEGMENT_ALIGN); - RO_DATA(PAGE_SIZE) /* everything from this point to */ - EXCEPTION_TABLE(8) /* _etext will be marked RO NX */ + _etext = .; /* End of text section */ + + RO_DATA(PAGE_SIZE) /* everything from this point to */ + EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */ NOTES . = ALIGN(SEGMENT_ALIGN); - _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) @@ -182,6 +183,25 @@ SECTIONS _data = .; _sdata = .; RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + + /* + * Data written with the MMU off but read with the MMU on requires + * cache lines to be invalidated, discarding up to a Cache Writeback + * Granule (CWG) of data from the cache. 
Keep the section that + * requires this type of maintenance to be in its own Cache Writeback + * Granule (CWG) area so the cache maintenance operations don't + * interfere with adjacent data. + */ + .mmuoff.data.write : ALIGN(SZ_2K) { + __mmuoff_data_start = .; + *(.mmuoff.data.write) + } + . = ALIGN(SZ_2K); + .mmuoff.data.read : { + *(.mmuoff.data.read) + __mmuoff_data_end = .; + } + PECOFF_EDATA_PADDING _edata = .; @@ -193,6 +213,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . += RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG |