-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10 | 69
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-stm | 53
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc | 77
-rw-r--r--  Documentation/ABI/testing/sysfs-class-stm | 10
-rw-r--r--  Documentation/arm64/booting.txt | 20
-rw-r--r--  Documentation/arm64/silicon-errata.txt | 58
-rw-r--r--  Documentation/cgroup-legacy/00-INDEX (renamed from Documentation/cgroups/00-INDEX) | 0
-rw-r--r--  Documentation/cgroup-legacy/blkio-controller.txt (renamed from Documentation/cgroups/blkio-controller.txt) | 79
-rw-r--r--  Documentation/cgroup-legacy/cgroups.txt (renamed from Documentation/cgroups/cgroups.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/cpuacct.txt (renamed from Documentation/cgroups/cpuacct.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/cpusets.txt (renamed from Documentation/cgroups/cpusets.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/devices.txt (renamed from Documentation/cgroups/devices.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/freezer-subsystem.txt (renamed from Documentation/cgroups/freezer-subsystem.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/hugetlb.txt (renamed from Documentation/cgroups/hugetlb.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/memcg_test.txt (renamed from Documentation/cgroups/memcg_test.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/memory.txt (renamed from Documentation/cgroups/memory.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/net_cls.txt (renamed from Documentation/cgroups/net_cls.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/net_prio.txt (renamed from Documentation/cgroups/net_prio.txt) | 0
-rw-r--r--  Documentation/cgroup-legacy/pids.txt (renamed from Documentation/cgroups/pids.txt) | 0
-rw-r--r--  Documentation/cgroup.txt | 1293
-rw-r--r--  Documentation/cgroups/unified-hierarchy.txt | 647
-rw-r--r--  Documentation/devicetree/bindings/opp/opp.txt | 132
-rw-r--r--  Documentation/features/time/irq-time-acct/arch-support.txt | 2
-rw-r--r--  Documentation/features/vm/huge-vmap/arch-support.txt | 2
-rw-r--r--  Documentation/kernel-parameters.txt | 4
-rw-r--r--  Documentation/trace/coresight.txt | 37
-rw-r--r--  MAINTAINERS | 5
-rw-r--r--  arch/Kconfig | 9
-rw-r--r--  arch/arm/Kconfig | 10
-rw-r--r--  arch/arm/include/asm/cacheflush.h | 1
-rw-r--r--  arch/arm/include/asm/cpuidle.h | 2
-rw-r--r--  arch/arm/include/asm/kvm_arm.h | 34
-rw-r--r--  arch/arm/include/asm/kvm_asm.h | 2
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 14
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 6
-rw-r--r--  arch/arm/include/asm/ptrace.h | 1
-rw-r--r--  arch/arm/include/asm/uaccess.h | 11
-rw-r--r--  arch/arm/include/asm/virt.h | 9
-rw-r--r--  arch/arm/kernel/Makefile | 3
-rw-r--r--  arch/arm/kernel/armksyms.c | 6
-rw-r--r--  arch/arm/kernel/cpuidle.c | 2
-rw-r--r--  arch/arm/kernel/psci-call.S | 31
-rw-r--r--  arch/arm/kernel/smccc-call.S | 62
-rw-r--r--  arch/arm/kvm/arm.c | 286
-rw-r--r--  arch/arm/kvm/emulate.c | 74
-rw-r--r--  arch/arm/kvm/mmu.c | 18
-rw-r--r--  arch/arm/vdso/vdso.S | 3
-rw-r--r--  arch/arm64/Kconfig | 106
-rw-r--r--  arch/arm64/Kconfig.debug | 2
-rw-r--r--  arch/arm64/Makefile | 11
-rw-r--r--  arch/arm64/boot/dts/arm/juno.dts | 305
-rw-r--r--  arch/arm64/configs/defconfig | 3
-rw-r--r--  arch/arm64/include/asm/acpi.h | 19
-rw-r--r--  arch/arm64/include/asm/alternative.h | 64
-rw-r--r--  arch/arm64/include/asm/assembler.h | 158
-rw-r--r--  arch/arm64/include/asm/atomic_lse.h | 38
-rw-r--r--  arch/arm64/include/asm/boot.h | 6
-rw-r--r--  arch/arm64/include/asm/brk-imm.h | 25
-rw-r--r--  arch/arm64/include/asm/bug.h | 2
-rw-r--r--  arch/arm64/include/asm/cacheflush.h | 5
-rw-r--r--  arch/arm64/include/asm/cmpxchg.h | 1
-rw-r--r--  arch/arm64/include/asm/cpu.h | 1
-rw-r--r--  arch/arm64/include/asm/cpufeature.h | 10
-rw-r--r--  arch/arm64/include/asm/cputype.h | 37
-rw-r--r--  arch/arm64/include/asm/debug-monitors.h | 19
-rw-r--r--  arch/arm64/include/asm/elf.h | 24
-rw-r--r--  arch/arm64/include/asm/fixmap.h | 10
-rw-r--r--  arch/arm64/include/asm/ftrace.h | 2
-rw-r--r--  arch/arm64/include/asm/futex.h | 12
-rw-r--r--  arch/arm64/include/asm/hardirq.h | 2
-rw-r--r--  arch/arm64/include/asm/hugetlb.h | 44
-rw-r--r--  arch/arm64/include/asm/insn.h | 41
-rw-r--r--  arch/arm64/include/asm/irq.h | 45
-rw-r--r--  arch/arm64/include/asm/kasan.h | 5
-rw-r--r--  arch/arm64/include/asm/kernel-pgtable.h | 12
-rw-r--r--  arch/arm64/include/asm/kprobes.h | 60
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h | 14
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 96
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 106
-rw-r--r--  arch/arm64/include/asm/kvm_mmio.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 10
-rw-r--r--  arch/arm64/include/asm/lse.h | 1
-rw-r--r--  arch/arm64/include/asm/memory.h | 68
-rw-r--r--  arch/arm64/include/asm/mmu_context.h | 62
-rw-r--r--  arch/arm64/include/asm/module.h | 17
-rw-r--r--  arch/arm64/include/asm/page.h | 2
-rw-r--r--  arch/arm64/include/asm/pgalloc.h | 26
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 18
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 134
-rw-r--r--  arch/arm64/include/asm/probes.h | 35
-rw-r--r--  arch/arm64/include/asm/processor.h | 9
-rw-r--r--  arch/arm64/include/asm/ptrace.h | 61
-rw-r--r--  arch/arm64/include/asm/shmparam.h | 2
-rw-r--r--  arch/arm64/include/asm/smp.h | 9
-rw-r--r--  arch/arm64/include/asm/spinlock.h | 23
-rw-r--r--  arch/arm64/include/asm/stacktrace.h | 9
-rw-r--r--  arch/arm64/include/asm/suspend.h | 32
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 63
-rw-r--r--  arch/arm64/include/asm/thread_info.h | 10
-rw-r--r--  arch/arm64/include/asm/uaccess.h | 111
-rw-r--r--  arch/arm64/include/asm/virt.h | 26
-rw-r--r--  arch/arm64/include/asm/word-at-a-time.h | 7
-rw-r--r--  arch/arm64/include/uapi/asm/ptrace.h | 1
-rw-r--r--  arch/arm64/kernel/Makefile | 14
-rw-r--r--  arch/arm64/kernel/acpi_parking_protocol.c | 153
-rw-r--r--  arch/arm64/kernel/alternative.c | 6
-rw-r--r--  arch/arm64/kernel/arm64ksyms.c | 11
-rw-r--r--  arch/arm64/kernel/armv8_deprecated.c | 32
-rw-r--r--  arch/arm64/kernel/asm-offsets.c | 64
-rw-r--r--  arch/arm64/kernel/cpu_errata.c | 18
-rw-r--r--  arch/arm64/kernel/cpu_ops.c | 27
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 144
-rw-r--r--  arch/arm64/kernel/cpuinfo.c | 55
-rw-r--r--  arch/arm64/kernel/debug-monitors.c | 33
-rw-r--r--  arch/arm64/kernel/efi-entry.S | 2
-rw-r--r--  arch/arm64/kernel/entry.S | 72
-rw-r--r--  arch/arm64/kernel/fpsimd.c | 2
-rw-r--r--  arch/arm64/kernel/ftrace.c | 27
-rw-r--r--  arch/arm64/kernel/head.S | 194
-rw-r--r--  arch/arm64/kernel/hibernate-asm.S | 176
-rw-r--r--  arch/arm64/kernel/hibernate.c | 487
-rw-r--r--  arch/arm64/kernel/hw_breakpoint.c | 8
-rw-r--r--  arch/arm64/kernel/hyp-stub.S | 45
-rw-r--r--  arch/arm64/kernel/image.h | 87
-rw-r--r--  arch/arm64/kernel/insn.c | 133
-rw-r--r--  arch/arm64/kernel/irq.c | 3
-rw-r--r--  arch/arm64/kernel/kaslr.c | 177
-rw-r--r--  arch/arm64/kernel/kgdb.c | 4
-rw-r--r--  arch/arm64/kernel/module-plts.c | 201
-rw-r--r--  arch/arm64/kernel/module.c | 96
-rw-r--r--  arch/arm64/kernel/module.lds | 3
-rw-r--r--  arch/arm64/kernel/perf_callchain.c | 5
-rw-r--r--  arch/arm64/kernel/probes/Makefile | 3
-rw-r--r--  arch/arm64/kernel/probes/decode-insn.c | 174
-rw-r--r--  arch/arm64/kernel/probes/decode-insn.h | 35
-rw-r--r--  arch/arm64/kernel/probes/kprobes.c | 657
-rw-r--r--  arch/arm64/kernel/probes/kprobes_trampoline.S | 81
-rw-r--r--  arch/arm64/kernel/probes/simulate-insn.c | 217
-rw-r--r--  arch/arm64/kernel/probes/simulate-insn.h | 28
-rw-r--r--  arch/arm64/kernel/process.c | 21
-rw-r--r--  arch/arm64/kernel/psci.c | 99
-rw-r--r--  arch/arm64/kernel/ptrace.c | 100
-rw-r--r--  arch/arm64/kernel/return_address.c | 5
-rw-r--r--  arch/arm64/kernel/setup.c | 37
-rw-r--r--  arch/arm64/kernel/sleep.S | 151
-rw-r--r--  arch/arm64/kernel/smccc-call.S | 43
-rw-r--r--  arch/arm64/kernel/smp.c | 32
-rw-r--r--  arch/arm64/kernel/stacktrace.c | 74
-rw-r--r--  arch/arm64/kernel/suspend.c | 106
-rw-r--r--  arch/arm64/kernel/time.c | 5
-rw-r--r--  arch/arm64/kernel/traps.c | 61
-rw-r--r--  arch/arm64/kernel/vdso/vdso.S | 3
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S | 68
-rw-r--r--  arch/arm64/kvm/Makefile | 3
-rw-r--r--  arch/arm64/kvm/guest.c | 1
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 8
-rw-r--r--  arch/arm64/kvm/hyp-init.S | 51
-rw-r--r--  arch/arm64/kvm/hyp.S | 1095
-rw-r--r--  arch/arm64/kvm/hyp/Makefile | 14
-rw-r--r--  arch/arm64/kvm/hyp/debug-sr.c | 141
-rw-r--r--  arch/arm64/kvm/hyp/entry.S | 160
-rw-r--r--  arch/arm64/kvm/hyp/fpsimd.S (renamed from arch/arm64/kernel/psci-call.S) | 27
-rw-r--r--  arch/arm64/kvm/hyp/hyp-entry.S | 232
-rw-r--r--  arch/arm64/kvm/hyp/hyp.h | 90
-rw-r--r--  arch/arm64/kvm/hyp/switch.c | 175
-rw-r--r--  arch/arm64/kvm/hyp/sysreg-sr.c | 138
-rw-r--r--  arch/arm64/kvm/hyp/timer-sr.c | 71
-rw-r--r--  arch/arm64/kvm/hyp/tlb.c | 80
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v2-sr.c | 84
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v3-sr.c | 228
-rw-r--r--  arch/arm64/kvm/reset.c | 14
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 59
-rw-r--r--  arch/arm64/kvm/vgic-v2-switch.S | 134
-rw-r--r--  arch/arm64/kvm/vgic-v3-switch.S | 269
-rw-r--r--  arch/arm64/lib/Makefile | 13
-rw-r--r--  arch/arm64/lib/clear_user.S | 12
-rw-r--r--  arch/arm64/lib/copy_from_user.S | 16
-rw-r--r--  arch/arm64/lib/copy_in_user.S | 20
-rw-r--r--  arch/arm64/lib/copy_page.S | 63
-rw-r--r--  arch/arm64/lib/copy_to_user.S | 16
-rw-r--r--  arch/arm64/mm/cache.S | 30
-rw-r--r--  arch/arm64/mm/context.c | 2
-rw-r--r--  arch/arm64/mm/copypage.c | 3
-rw-r--r--  arch/arm64/mm/dma-mapping.c | 4
-rw-r--r--  arch/arm64/mm/dump.c | 17
-rw-r--r--  arch/arm64/mm/extable.c | 2
-rw-r--r--  arch/arm64/mm/fault.c | 60
-rw-r--r--  arch/arm64/mm/flush.c | 33
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 260
-rw-r--r--  arch/arm64/mm/init.c | 135
-rw-r--r--  arch/arm64/mm/kasan_init.c | 79
-rw-r--r--  arch/arm64/mm/mmu.c | 540
-rw-r--r--  arch/arm64/mm/pageattr.c | 69
-rw-r--r--  arch/arm64/mm/pgd.c | 12
-rw-r--r--  arch/arm64/mm/proc-macros.S | 22
-rw-r--r--  arch/arm64/mm/proc.S | 84
-rw-r--r--  arch/ia64/Kconfig | 1
-rw-r--r--  arch/ia64/include/asm/uaccess.h | 10
-rw-r--r--  arch/parisc/Kconfig | 1
-rw-r--r--  arch/parisc/include/asm/assembly.h | 2
-rw-r--r--  arch/parisc/include/asm/cache.h | 3
-rw-r--r--  arch/parisc/include/asm/cacheflush.h | 4
-rw-r--r--  arch/parisc/include/asm/uaccess.h | 7
-rw-r--r--  arch/parisc/mm/fault.c | 9
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  arch/powerpc/include/asm/uaccess.h | 14
-rw-r--r--  arch/s390/Kconfig | 1
-rw-r--r--  arch/s390/lib/uaccess.c | 2
-rw-r--r--  arch/sparc/Kconfig | 1
-rw-r--r--  arch/sparc/include/asm/uaccess_32.h | 9
-rw-r--r--  arch/sparc/include/asm/uaccess_64.h | 4
-rw-r--r--  arch/x86/Kconfig | 6
-rw-r--r--  arch/x86/Kconfig.debug | 18
-rw-r--r--  arch/x86/entry/vdso/vdso2c.h | 2
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 6
-rw-r--r--  arch/x86/include/asm/efi.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 7
-rw-r--r--  arch/x86/include/asm/sections.h | 2
-rw-r--r--  arch/x86/include/asm/thread_info.h | 44
-rw-r--r--  arch/x86/include/asm/uaccess.h | 88
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 62
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 96
-rw-r--r--  arch/x86/kernel/ftrace.c | 6
-rw-r--r--  arch/x86/kernel/kgdb.c | 8
-rw-r--r--  arch/x86/kernel/test_nx.c | 2
-rw-r--r--  arch/x86/kernel/test_rodata.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 25
-rw-r--r--  arch/x86/mm/init_32.c | 3
-rw-r--r--  arch/x86/mm/init_64.c | 3
-rw-r--r--  arch/x86/mm/pageattr.c | 2
-rw-r--r--  backported-features | 14
-rw-r--r--  block/blk-cgroup.c | 9
-rw-r--r--  drivers/base/power/opp/Makefile | 1
-rw-r--r--  drivers/base/power/opp/core.c | 1229
-rw-r--r--  drivers/base/power/opp/cpu.c | 25
-rw-r--r--  drivers/base/power/opp/debugfs.c | 218
-rw-r--r--  drivers/base/power/opp/opp.h | 107
-rw-r--r--  drivers/cpufreq/cpufreq-dt.c | 309
-rw-r--r--  drivers/firmware/Kconfig | 3
-rw-r--r--  drivers/firmware/efi/libstub/Makefile | 7
-rw-r--r--  drivers/firmware/efi/libstub/arm-stub.c | 40
-rw-r--r--  drivers/firmware/efi/libstub/arm64-stub.c | 87
-rw-r--r--  drivers/firmware/efi/libstub/efi-stub-helper.c | 7
-rw-r--r--  drivers/firmware/efi/libstub/efistub.h | 7
-rw-r--r--  drivers/firmware/efi/libstub/fdt.c | 14
-rw-r--r--  drivers/firmware/efi/libstub/random.c | 135
-rw-r--r--  drivers/firmware/psci.c | 143
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4
-rw-r--r--  drivers/hwtracing/coresight/Kconfig | 14
-rw-r--r--  drivers/hwtracing/coresight/Makefile | 13
-rw-r--r--  drivers/hwtracing/coresight/coresight-etb10.c | 356
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm-perf.c | 510
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm-perf.h | 32
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm.h | 142
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm3x-sysfs.c | 1265
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm3x.c | 1687
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm4x-sysfs.c | 2126
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm4x.c | 2431
-rw-r--r--  drivers/hwtracing/coresight/coresight-etm4x.h | 222
-rw-r--r--  drivers/hwtracing/coresight/coresight-funnel.c | 22
-rw-r--r--  drivers/hwtracing/coresight/coresight-priv.h | 46
-rw-r--r--  drivers/hwtracing/coresight/coresight-replicator-qcom.c | 19
-rw-r--r--  drivers/hwtracing/coresight/coresight-replicator.c | 26
-rw-r--r--  drivers/hwtracing/coresight/coresight-stm.c | 920
-rw-r--r--  drivers/hwtracing/coresight/coresight-tmc-etf.c | 604
-rw-r--r--  drivers/hwtracing/coresight/coresight-tmc-etr.c | 569
-rw-r--r--  drivers/hwtracing/coresight/coresight-tmc.c | 635
-rw-r--r--  drivers/hwtracing/coresight/coresight-tmc.h | 140
-rw-r--r--  drivers/hwtracing/coresight/coresight-tpiu.c | 24
-rw-r--r--  drivers/hwtracing/coresight/coresight.c | 503
-rw-r--r--  drivers/hwtracing/coresight/of_coresight.c | 3
-rw-r--r--  drivers/hwtracing/stm/Kconfig | 15
-rw-r--r--  drivers/hwtracing/stm/Makefile | 2
-rw-r--r--  drivers/hwtracing/stm/core.c | 203
-rw-r--r--  drivers/hwtracing/stm/dummy_stm.c | 67
-rw-r--r--  drivers/hwtracing/stm/heartbeat.c | 126
-rw-r--r--  drivers/hwtracing/stm/policy.c | 27
-rw-r--r--  drivers/hwtracing/stm/stm.h | 2
-rw-r--r--  drivers/misc/lkdtm.c | 29
-rw-r--r--  drivers/of/fdt.c | 19
-rw-r--r--  drivers/soc/qcom/spm.c | 10
-rw-r--r--  fs/inode.c | 6
-rw-r--r--  include/asm-generic/fixmap.h | 12
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 1
-rw-r--r--  include/clocksource/arm_arch_timer.h | 6
-rw-r--r--  include/kvm/arm_vgic.h | 6
-rw-r--r--  include/linux/amba/bus.h | 9
-rw-r--r--  include/linux/arm-smccc.h | 104
-rw-r--r--  include/linux/cache.h | 14
-rw-r--r--  include/linux/cgroup-defs.h | 1
-rw-r--r--  include/linux/coresight-pmu.h | 39
-rw-r--r--  include/linux/coresight-stm.h | 6
-rw-r--r--  include/linux/coresight.h | 34
-rw-r--r--  include/linux/efi.h | 6
-rw-r--r--  include/linux/hugetlb.h | 2
-rw-r--r--  include/linux/init.h | 4
-rw-r--r--  include/linux/mmzone.h | 8
-rw-r--r--  include/linux/perf_event.h | 11
-rw-r--r--  include/linux/pm_opp.h | 49
-rw-r--r--  include/linux/psci.h | 3
-rw-r--r--  include/linux/slab.h | 12
-rw-r--r--  include/linux/slub_def.h | 1
-rw-r--r--  include/linux/stm.h | 13
-rw-r--r--  include/linux/swap.h | 1
-rw-r--r--  include/linux/thread_info.h | 25
-rw-r--r--  include/linux/uaccess.h | 7
-rw-r--r--  include/uapi/linux/coresight-stm.h | 21
-rw-r--r--  include/uapi/linux/magic.h | 1
-rw-r--r--  include/uapi/linux/perf_event.h | 1
-rw-r--r--  init/Kconfig | 2
-rw-r--r--  init/main.c | 27
-rw-r--r--  kernel/cgroup.c | 47
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 4
-rw-r--r--  kernel/events/core.c | 140
-rw-r--r--  kernel/events/internal.h | 1
-rw-r--r--  kernel/events/ring_buffer.c | 94
-rw-r--r--  kernel/events/uprobes.c | 3
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/swap.c | 18
-rw-r--r--  lib/extable.c | 50
-rw-r--r--  lib/strncpy_from_user.c | 15
-rw-r--r--  lib/strnlen_user.c | 21
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/maccess.c | 3
-rw-r--r--  mm/page-writeback.c | 14
-rw-r--r--  mm/page_alloc.c | 21
-rw-r--r--  mm/slab.c | 30
-rw-r--r--  mm/slub.c | 140
-rw-r--r--  mm/usercopy.c | 277
-rw-r--r--  samples/kprobes/kprobe_example.c | 9
-rw-r--r--  scripts/sortextable.c | 11
-rw-r--r--  security/Kconfig | 39
-rw-r--r--  tools/perf/MANIFEST | 1
-rw-r--r--  tools/perf/Makefile.perf | 3
-rw-r--r--  tools/perf/arch/arm/util/Build | 2
-rw-r--r--  tools/perf/arch/arm/util/auxtrace.c | 54
-rw-r--r--  tools/perf/arch/arm/util/cs-etm.c | 563
-rw-r--r--  tools/perf/arch/arm/util/cs-etm.h | 23
-rw-r--r--  tools/perf/arch/arm/util/pmu.c | 34
-rw-r--r--  tools/perf/arch/arm64/util/Build | 4
-rw-r--r--  tools/perf/arch/x86/util/intel-bts.c | 4
-rw-r--r--  tools/perf/arch/x86/util/intel-pt.c | 4
-rw-r--r--  tools/perf/builtin-inject.c | 12
-rw-r--r--  tools/perf/builtin-record.c | 9
-rw-r--r--  tools/perf/builtin-script.c | 3
-rw-r--r--  tools/perf/config/Makefile | 29
-rw-r--r--  tools/perf/scripts/python/cs-trace-disasm.py | 124
-rw-r--r--  tools/perf/scripts/python/cs-trace-ranges.py | 44
-rw-r--r--  tools/perf/util/Build | 2
-rw-r--r--  tools/perf/util/auxtrace.c | 10
-rw-r--r--  tools/perf/util/auxtrace.h | 7
-rw-r--r--  tools/perf/util/build-id.c | 2
-rw-r--r--  tools/perf/util/build-id.h | 1
-rw-r--r--  tools/perf/util/cpumap.c | 30
-rw-r--r--  tools/perf/util/cpumap.h | 32
-rw-r--r--  tools/perf/util/cs-etm-decoder/Build | 7
-rw-r--r--  tools/perf/util/cs-etm-decoder/cs-etm-decoder-stub.c | 91
-rw-r--r--  tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 503
-rw-r--r--  tools/perf/util/cs-etm-decoder/cs-etm-decoder.h | 118
-rw-r--r--  tools/perf/util/cs-etm.c | 1533
-rw-r--r--  tools/perf/util/cs-etm.h | 84
-rw-r--r--  tools/perf/util/evlist.c | 26
-rw-r--r--  tools/perf/util/evlist.h | 3
-rw-r--r--  tools/perf/util/evsel.c | 43
-rw-r--r--  tools/perf/util/evsel.h | 8
-rw-r--r--  tools/perf/util/machine.c | 35
-rw-r--r--  tools/perf/util/parse-events.c | 67
-rw-r--r--  tools/perf/util/parse-events.h | 3
-rw-r--r--  tools/perf/util/parse-events.l | 12
-rw-r--r--  tools/perf/util/parse-events.y | 11
-rw-r--r--  tools/perf/util/scripting-engines/trace-event-python.c | 2
-rw-r--r--  tools/perf/util/session.c | 40
-rw-r--r--  tools/perf/util/symbol-minimal.c | 2
-rw-r--r--  tools/perf/util/symbol.c | 3
-rw-r--r--  virt/kvm/arm/vgic-v3.c | 11
379 files changed, 25187 insertions, 9387 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10 b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10
index 4b8d6ec92e2b..b5f526081711 100644
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10
@@ -6,13 +6,6 @@ Description: (RW) Add/remove a sink from a trace path. There can be multiple
source for a single sink.
ex: echo 1 > /sys/bus/coresight/devices/20010000.etb/enable_sink
-What: /sys/bus/coresight/devices/<memory_map>.etb/status
-Date: November 2014
-KernelVersion: 3.19
-Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
-Description: (R) List various control and status registers. The specific
- layout and content is driver specific.
-
What: /sys/bus/coresight/devices/<memory_map>.etb/trigger_cntr
Date: November 2014
KernelVersion: 3.19
@@ -22,3 +15,65 @@ Description: (RW) Disables write access to the Trace RAM by stopping the
following the trigger event. The number of 32-bit words written
into the Trace RAM following the trigger event is equal to the
value stored in this register+1 (from ARM ETB-TRM).
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/rdp
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Defines the depth, in words, of the trace RAM in powers of
+ 2. The value is read directly from HW register RDP, 0x004.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/sts
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the ETB status register. The value
+ is read directly from HW register STS, 0x00C.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/rrp
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the ETB RAM Read Pointer register
+ that is used to read entries from the Trace RAM over the APB
+ interface. The value is read directly from HW register RRP,
+ 0x014.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/rwp
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the ETB RAM Write Pointer register
+ that is used to set the write pointer for writing entries from
+ the CoreSight bus into the Trace RAM. The value is read directly
+ from HW register RWP, 0x018.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/trg
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Similar to "trigger_cntr" above except that this value is
+ read directly from HW register TRG, 0x01C.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/ctl
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the ETB Control register. The value
+ is read directly from HW register CTL, 0x020.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/ffsr
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the ETB Formatter and Flush Status
+ register. The value is read directly from HW register FFSR,
+ 0x300.
+
+What: /sys/bus/coresight/devices/<memory_map>.etb/mgmt/ffcr
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the ETB Formatter and Flush Control
+ register. The value is read directly from HW register FFCR,
+ 0x304.
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
index 2355ed8ae31f..36258bc1b473 100644
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
@@ -359,6 +359,19 @@ Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
Description: (R) Print the content of the Peripheral ID3 Register
(0xFEC). The value is taken directly from the HW.
+What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcconfig
+Date: February 2016
+KernelVersion: 4.07
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Print the content of the trace configuration register
+ (0x010) as currently set by SW.
+
+What: /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trctraceid
+Date: February 2016
+KernelVersion: 4.07
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Print the content of the trace ID register (0x040).
+
What: /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr0
Date: April 2015
KernelVersion: 4.01
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
new file mode 100644
index 000000000000..1dffabe7f48d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
@@ -0,0 +1,53 @@
+What: /sys/bus/coresight/devices/<memory_map>.stm/enable_source
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (RW) Enable/disable tracing on this specific trace macrocell.
+ Enabling the trace macrocell implies it has been configured
+ properly and a sink has been identified for it. The path
+ of coresight components linking the source to the sink is
+ configured and managed automatically by the coresight framework.
+
+What: /sys/bus/coresight/devices/<memory_map>.stm/hwevent_enable
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (RW) Provides access to the HW event enable register, used in
+ conjunction with HW event bank select register.
+
+What: /sys/bus/coresight/devices/<memory_map>.stm/hwevent_select
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (RW) Gives access to the HW event bank select register
+ (STMHEBSR) in order to configure up to 256 channels. Used in
+ conjunction with "hwevent_enable" register as described above.
+
+What: /sys/bus/coresight/devices/<memory_map>.stm/port_enable
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (RW) Provides access to the stimulus port enable register
+ (STMSPER). Used in conjunction with "port_select" described
+ below.
+
+What: /sys/bus/coresight/devices/<memory_map>.stm/port_select
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (RW) Used to determine which bank of stimulus ports the
+ bits in register STMSPER (see above) apply to.
+
+What: /sys/bus/coresight/devices/<memory_map>.stm/status
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Lists various control and status registers. The
+ specific layout and content are driver specific.
+
+What: /sys/bus/coresight/devices/<memory_map>.stm/traceid
+Date: April 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (RW) Holds the trace ID that will appear in the trace stream
+ coming from this trace entity.
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc
index f38cded5fa22..4fe677ed1305 100644
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc
@@ -6,3 +6,80 @@ Description: (RW) Disables write access to the Trace RAM by stopping the
formatter after a defined number of words have been stored
following the trigger event. Additional interface for this
driver are expected to be added as it matures.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/rsz
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Defines the size, in 32-bit words, of the local RAM buffer.
+ The value is read directly from HW register RSZ, 0x004.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/sts
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC status register. The value
+ is read directly from HW register STS, 0x00C.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/rrp
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC RAM Read Pointer register
+ that is used to read entries from the Trace RAM over the APB
+ interface. The value is read directly from HW register RRP,
+ 0x014.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/rwp
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC RAM Write Pointer register
+ that is used to set the write pointer for writing entries from
+ the CoreSight bus into the Trace RAM. The value is read directly
+ from HW register RWP, 0x018.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/trg
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Similar to "trigger_cntr" above except that this value is
+ read directly from HW register TRG, 0x01C.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/ctl
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC Control register. The value
+ is read directly from HW register CTL, 0x020.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/ffsr
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC Formatter and Flush Status
+ register. The value is read directly from HW register FFSR,
+ 0x300.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/ffcr
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC Formatter and Flush Control
+ register. The value is read directly from HW register FFCR,
+ 0x304.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/mode
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Shows the value held by the TMC Mode register, which
+ indicates the mode the device has been configured to enact. The
+ value is read directly from the MODE register, 0x028.
+
+What: /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/devid
+Date: March 2016
+KernelVersion: 4.7
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
+Description: (R) Indicates the capabilities of the CoreSight TMC.
+ The value is read directly from the DEVID register, 0xFC8.
diff --git a/Documentation/ABI/testing/sysfs-class-stm b/Documentation/ABI/testing/sysfs-class-stm
index c9aa4f3fc9a7..77ed3da0f68e 100644
--- a/Documentation/ABI/testing/sysfs-class-stm
+++ b/Documentation/ABI/testing/sysfs-class-stm
@@ -12,3 +12,13 @@ KernelVersion: 4.3
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Description:
Shows the number of channels per master on this STM device.
+
+What: /sys/class/stm/<stm>/hw_override
+Date: March 2016
+KernelVersion: 4.7
+Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Description:
+ Reads as 0 if master numbers in the STP stream produced by
+ this stm device will match the master numbers assigned by
+ the software, or 1 if the stm hardware overrides the
+ software-assigned masters.
diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt
index 701d39d3171a..56d6d8b796db 100644
--- a/Documentation/arm64/booting.txt
+++ b/Documentation/arm64/booting.txt
@@ -109,7 +109,13 @@ Header notes:
1 - 4K
2 - 16K
3 - 64K
- Bits 3-63: Reserved.
+ Bit 3: Kernel physical placement
+ 0 - 2MB aligned base should be as close as possible
+ to the base of DRAM, since memory below it is not
+ accessible via the linear mapping
+ 1 - 2MB aligned base may be anywhere in physical
+ memory
+ Bits 4-63: Reserved.
- When image_size is zero, a bootloader should attempt to keep as much
memory as possible free for use by the kernel immediately after the
@@ -117,14 +123,14 @@ Header notes:
depending on selected features, and is effectively unbound.
The Image must be placed text_offset bytes from a 2MB aligned base
-address near the start of usable system RAM and called there. Memory
-below that base address is currently unusable by Linux, and therefore it
-is strongly recommended that this location is the start of system RAM.
-The region between the 2 MB aligned base address and the start of the
-image has no special significance to the kernel, and may be used for
-other purposes.
+address anywhere in usable system RAM and called there. The region
+between the 2 MB aligned base address and the start of the image has no
+special significance to the kernel, and may be used for other purposes.
At least image_size bytes from the start of the image must be free for
use by the kernel.
+NOTE: kernel versions prior to v4.6 cannot make use of memory below
+the physical offset of the Image, so it is recommended that the Image
+be placed as close as possible to the start of system RAM.
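+
+As an illustration, a boot tool could check bit 3 of the flags field,
+which per the header layout defined earlier in this document is the
+little-endian 64-bit word at offset 0x18 of the Image. A minimal
+sketch, assuming hexdump(1) is available:
+
+  $ hexdump -s 0x18 -n 8 -e '8/1 "%02x " "\n"' Image
+
+The first byte printed is the least significant, so bit 3 of that
+byte is the placement flag described above.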
Any memory described to the kernel (even that below the start of the
image) which is not marked as reserved from the kernel (e.g., with a
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
new file mode 100644
index 000000000000..58b71ddf9b60
--- /dev/null
+++ b/Documentation/arm64/silicon-errata.txt
@@ -0,0 +1,58 @@
+ Silicon Errata and Software Workarounds
+ =======================================
+
+Author: Will Deacon <will.deacon@arm.com>
+Date : 27 November 2015
+
+It is an unfortunate fact of life that hardware is often produced with
+so-called "errata", which can cause it to deviate from the architecture
+under specific circumstances. For hardware produced by ARM, these
+errata are broadly classified into the following categories:
+
+ Category A: A critical error without a viable workaround.
+ Category B: A significant or critical error with an acceptable
+ workaround.
+ Category C: A minor error that is not expected to occur under normal
+ operation.
+
+For more information, consult one of the "Software Developers Errata
+Notice" documents available on infocenter.arm.com (registration
+required).
+
+As far as Linux is concerned, Category B errata may require some special
+treatment in the operating system. For example, avoiding a particular
+sequence of code, or configuring the processor in a particular way. A
+less common situation may require similar actions in order to declassify
+a Category A erratum into a Category C erratum. These are collectively
+known as "software workarounds" and are only required in the minority of
+cases (e.g. those cases that both require a non-secure workaround *and*
+can be triggered by Linux).
+
+For software workarounds that may adversely impact systems unaffected by
+the erratum in question, a Kconfig entry is added under "Kernel
+Features" -> "ARM errata workarounds via the alternatives framework".
+These are enabled by default and patched in at runtime when an affected
+CPU is detected. For less-intrusive workarounds, a Kconfig option is not
+available and the code is structured (preferably with a comment) in such
+a way that the erratum will not be hit.
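+
+For instance, whether these default-on workarounds made it into a
+particular build can be checked against the kernel configuration
+(output shown is illustrative and will vary by config):
+
+  $ grep ERRATUM .config
+  CONFIG_ARM64_ERRATUM_826319=y
+  ...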
+
+This approach can make it slightly onerous to determine exactly which
+errata are worked around in an arbitrary kernel source tree, so this
+file acts as a registry of software workarounds in the Linux Kernel and
+will be updated when new workarounds are committed and backported to
+stable kernels.
+
+| Implementor    | Component       | Erratum ID      | Kconfig                 |
++----------------+-----------------+-----------------+-------------------------+
+| ARM            | Cortex-A53      | #826319         | ARM64_ERRATUM_826319    |
+| ARM            | Cortex-A53      | #827319         | ARM64_ERRATUM_827319    |
+| ARM            | Cortex-A53      | #824069         | ARM64_ERRATUM_824069    |
+| ARM            | Cortex-A53      | #819472         | ARM64_ERRATUM_819472    |
+| ARM            | Cortex-A53      | #845719         | ARM64_ERRATUM_845719    |
+| ARM            | Cortex-A53      | #843419         | ARM64_ERRATUM_843419    |
+| ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075    |
+| ARM            | Cortex-A57      | #852523         | N/A                     |
+| ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220    |
+|                |                 |                 |                         |
+| Cavium         | ThunderX ITS    | #22375, #24313  | CAVIUM_ERRATUM_22375    |
+| Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154    |
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroup-legacy/00-INDEX
index 3f5a40f57d4a..3f5a40f57d4a 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroup-legacy/00-INDEX
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroup-legacy/blkio-controller.txt
index 52fa9f353342..4ecc954a3063 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroup-legacy/blkio-controller.txt
@@ -374,82 +374,3 @@ One can experience an overall throughput drop if you have created multiple
groups and put applications in that group which are not driving enough
IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
on individual groups and throughput should improve.
-
-Writeback
-=========
-
-Page cache is dirtied through buffered writes and shared mmaps and
-written asynchronously to the backing filesystem by the writeback
-mechanism. Writeback sits between the memory and IO domains and
-regulates the proportion of dirty memory by balancing dirtying and
-write IOs.
-
-On traditional cgroup hierarchies, relationships between different
-controllers cannot be established making it impossible for writeback
-to operate accounting for cgroup resource restrictions and all
-writeback IOs are attributed to the root cgroup.
-
-If both the blkio and memory controllers are used on the v2 hierarchy
-and the filesystem supports cgroup writeback, writeback operations
-correctly follow the resource restrictions imposed by both memory and
-blkio controllers.
-
-Writeback examines both system-wide and per-cgroup dirty memory status
-and enforces the more restrictive of the two. Also, writeback control
-parameters which are absolute values - vm.dirty_bytes and
-vm.dirty_background_bytes - are distributed across cgroups according
-to their current writeback bandwidth.
-
-There's a peculiarity stemming from the discrepancy in ownership
-granularity between memory controller and writeback. While memory
-controller tracks ownership per page, writeback operates on inode
-basis. cgroup writeback bridges the gap by tracking ownership by
-inode but migrating ownership if too many foreign pages, pages which
-don't match the current inode ownership, have been encountered while
-writing back the inode.
-
-This is a conscious design choice as writeback operations are
-inherently tied to inodes making strictly following page ownership
-complicated and inefficient. The only use case which suffers from
-this compromise is multiple cgroups concurrently dirtying disjoint
-regions of the same inode, which is an unlikely use case and decided
-to be unsupported. Note that as memory controller assigns page
-ownership on the first use and doesn't update it until the page is
-released, even if cgroup writeback strictly follows page ownership,
-multiple cgroups dirtying overlapping areas wouldn't work as expected.
-In general, write-sharing an inode across multiple cgroups is not well
-supported.
-
-Filesystem support for cgroup writeback
----------------------------------------
-
-A filesystem can make writeback IOs cgroup-aware by updating
-address_space_operations->writepage[s]() to annotate bio's using the
-following two functions.
-
-* wbc_init_bio(@wbc, @bio)
-
- Should be called for each bio carrying writeback data and associates
- the bio with the inode's owner cgroup. Can be called anytime
- between bio allocation and submission.
-
-* wbc_account_io(@wbc, @page, @bytes)
-
- Should be called for each data segment being written out. While
- this function doesn't care exactly when it's called during the
- writeback session, it's the easiest and most natural to call it as
- data segments are added to a bio.
-
-With writeback bio's annotated, cgroup support can be enabled per
-super_block by setting MS_CGROUPWB in ->s_flags. This allows for
-selective disabling of cgroup writeback support which is helpful when
-certain filesystem features, e.g. journaled data mode, are
-incompatible.
-
-wbc_init_bio() binds the specified bio to its cgroup. Depending on
-the configuration, the bio may be executed at a lower priority and if
-the writeback session is holding shared resources, e.g. a journal
-entry, may lead to priority inversion. There is no one easy solution
-for the problem. Filesystems can try to work around specific problem
-cases by skipping wbc_init_bio() or using bio_associate_blkcg()
-directly.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroup-legacy/cgroups.txt
index c6256ae9885b..c6256ae9885b 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroup-legacy/cgroups.txt
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroup-legacy/cpuacct.txt
index 9d73cc0cadb9..9d73cc0cadb9 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroup-legacy/cpuacct.txt
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroup-legacy/cpusets.txt
index fdf7dff3f607..fdf7dff3f607 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroup-legacy/cpusets.txt
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroup-legacy/devices.txt
index 3c1095ca02ea..3c1095ca02ea 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroup-legacy/devices.txt
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroup-legacy/freezer-subsystem.txt
index e831cb2b8394..e831cb2b8394 100644
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroup-legacy/freezer-subsystem.txt
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroup-legacy/hugetlb.txt
index 106245c3aecc..106245c3aecc 100644
--- a/Documentation/cgroups/hugetlb.txt
+++ b/Documentation/cgroup-legacy/hugetlb.txt
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroup-legacy/memcg_test.txt
index 8870b0212150..8870b0212150 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroup-legacy/memcg_test.txt
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroup-legacy/memory.txt
index ff71e16cc752..ff71e16cc752 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroup-legacy/memory.txt
diff --git a/Documentation/cgroups/net_cls.txt b/Documentation/cgroup-legacy/net_cls.txt
index ec182346dea2..ec182346dea2 100644
--- a/Documentation/cgroups/net_cls.txt
+++ b/Documentation/cgroup-legacy/net_cls.txt
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroup-legacy/net_prio.txt
index a82cbd28ea8a..a82cbd28ea8a 100644
--- a/Documentation/cgroups/net_prio.txt
+++ b/Documentation/cgroup-legacy/net_prio.txt
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroup-legacy/pids.txt
index 1a078b5d281a..1a078b5d281a 100644
--- a/Documentation/cgroups/pids.txt
+++ b/Documentation/cgroup-legacy/pids.txt
diff --git a/Documentation/cgroup.txt b/Documentation/cgroup.txt
new file mode 100644
index 000000000000..31d1f7bf12a1
--- /dev/null
+++ b/Documentation/cgroup.txt
@@ -0,0 +1,1293 @@
+
+Control Group v2
+
+October, 2015 Tejun Heo <tj@kernel.org>
+
+This is the authoritative documentation on the design, interface and
+conventions of cgroup v2. It describes all userland-visible aspects
+of cgroup including core and specific controller behaviors. All
+future changes must be reflected in this document. Documentation for
+v1 is available under Documentation/cgroup-legacy/.
+
+CONTENTS
+
+1. Introduction
+ 1-1. Terminology
+ 1-2. What is cgroup?
+2. Basic Operations
+ 2-1. Mounting
+ 2-2. Organizing Processes
+ 2-3. [Un]populated Notification
+ 2-4. Controlling Controllers
+ 2-4-1. Enabling and Disabling
+ 2-4-2. Top-down Constraint
+ 2-4-3. No Internal Process Constraint
+ 2-5. Delegation
+ 2-5-1. Model of Delegation
+ 2-5-2. Delegation Containment
+ 2-6. Guidelines
+ 2-6-1. Organize Once and Control
+ 2-6-2. Avoid Name Collisions
+3. Resource Distribution Models
+ 3-1. Weights
+ 3-2. Limits
+ 3-3. Protections
+ 3-4. Allocations
+4. Interface Files
+ 4-1. Format
+ 4-2. Conventions
+ 4-3. Core Interface Files
+5. Controllers
+ 5-1. CPU
+ 5-1-1. CPU Interface Files
+ 5-2. Memory
+ 5-2-1. Memory Interface Files
+ 5-2-2. Usage Guidelines
+ 5-2-3. Memory Ownership
+ 5-3. IO
+ 5-3-1. IO Interface Files
+ 5-3-2. Writeback
+P. Information on Kernel Programming
+ P-1. Filesystem Support for Writeback
+D. Deprecated v1 Core Features
+R. Issues with v1 and Rationales for v2
+ R-1. Multiple Hierarchies
+ R-2. Thread Granularity
+ R-3. Competition Between Inner Nodes and Threads
+ R-4. Other Interface Issues
+ R-5. Controller Issues and Remedies
+ R-5-1. Memory
+
+
+1. Introduction
+
+1-1. Terminology
+
+"cgroup" stands for "control group" and is never capitalized. The
+singular form is used to designate the whole feature and also as a
+qualifier as in "cgroup controllers". When explicitly referring to
+multiple individual control groups, the plural form "cgroups" is used.
+
+
+1-2. What is cgroup?
+
+cgroup is a mechanism to organize processes hierarchically and
+distribute system resources along the hierarchy in a controlled and
+configurable manner.
+
+cgroup is largely composed of two parts - the core and controllers.
+cgroup core is primarily responsible for hierarchically organizing
+processes. A cgroup controller is usually responsible for
+distributing a specific type of system resource along the hierarchy
+although there are utility controllers which serve purposes other than
+resource distribution.
+
+cgroups form a tree structure and every process in the system belongs
+to one and only one cgroup. All threads of a process belong to the
+same cgroup. On creation, all processes are put in the cgroup that
+the parent process belongs to at the time. A process can be migrated
+to another cgroup. Migration of a process doesn't affect already
+existing descendant processes.
+
+Following certain structural constraints, controllers may be enabled or
+disabled selectively on a cgroup. All controller behaviors are
+hierarchical - if a controller is enabled on a cgroup, it affects all
+processes which belong to the cgroups comprising the inclusive
+sub-hierarchy of the cgroup. When a controller is enabled on a nested
+cgroup, it always restricts the resource distribution further. The
+restrictions set closer to the root in the hierarchy can not be
+overridden from further away.
+
+
+2. Basic Operations
+
+2-1. Mounting
+
+Unlike v1, cgroup v2 has only a single hierarchy. The cgroup v2
+hierarchy can be mounted with the following mount command.
+
+ # mount -t cgroup2 none $MOUNT_POINT
+
+cgroup2 filesystem has the magic number 0x63677270 ("cgrp"). All
+controllers which support v2 and are not bound to a v1 hierarchy are
+automatically bound to the v2 hierarchy and show up at the root.
+Controllers which are not in active use in the v2 hierarchy can be
+bound to other hierarchies. This allows mixing v2 hierarchy with the
+legacy v1 multiple hierarchies in a fully backward compatible way.
+
+A controller can be moved across hierarchies only after the controller
+is no longer referenced in its current hierarchy. Because per-cgroup
+controller states are destroyed asynchronously and controllers may
+have lingering references, a controller may not show up immediately on
+the v2 hierarchy after the final umount of the previous hierarchy.
+Similarly, a controller should be fully disabled to be moved out of
+the unified hierarchy and it may take some time for the disabled
+controller to become available for other hierarchies; furthermore, due
+to inter-controller dependencies, other controllers may need to be
+disabled too.
+
+While useful for development and manual configurations, moving
+controllers dynamically between the v2 and other hierarchies is
+strongly discouraged for production use. It is recommended to decide
+the hierarchies and controller associations before starting using the
+controllers after system boot.
+
+
+2-2. Organizing Processes
+
+Initially, only the root cgroup exists to which all processes belong.
+A child cgroup can be created by creating a sub-directory.
+
+ # mkdir $CGROUP_NAME
+
+A given cgroup may have multiple child cgroups forming a tree
+structure. Each cgroup has a read-writable interface file
+"cgroup.procs". When read, it lists the PIDs of all processes which
+belong to the cgroup one-per-line. The PIDs are not ordered and the
+same PID may show up more than once if the process got moved to
+another cgroup and then back or the PID got recycled while reading.
+
+A process can be migrated into a cgroup by writing its PID to the
+target cgroup's "cgroup.procs" file. Only one process can be migrated
+on a single write(2) call. If a process is composed of multiple
+threads, writing the PID of any thread migrates all threads of the
+process.
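+
+For example, assuming a v2 hierarchy mounted at /sys/fs/cgroup with a
+child cgroup named "test", a shell can migrate itself by writing its
+own PID:
+
+ # echo $$ > /sys/fs/cgroup/test/cgroup.procs
+ # cat /proc/self/cgroup
+ 0::/test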
+
+When a process forks a child process, the new process is born into the
+cgroup that the forking process belongs to at the time of the
+operation. After exit, a process stays associated with the cgroup
+that it belonged to at the time of exit until it's reaped; however, a
+zombie process does not appear in "cgroup.procs" and thus can't be
+moved to another cgroup.
+
+A cgroup which doesn't have any children or live processes can be
+destroyed by removing the directory. Note that a cgroup which doesn't
+have any children and is associated only with zombie processes is
+considered empty and can be removed.
+
+ # rmdir $CGROUP_NAME
+
+"/proc/$PID/cgroup" lists a process's cgroup membership. If legacy
+cgroup is in use in the system, this file may contain multiple lines,
+one for each hierarchy. The entry for cgroup v2 is always in the
+format "0::$PATH".
+
+ # cat /proc/842/cgroup
+ ...
+ 0::/test-cgroup/test-cgroup-nested
+
+If the process becomes a zombie and the cgroup it was associated with
+is removed subsequently, " (deleted)" is appended to the path.
+
+ # cat /proc/842/cgroup
+ ...
+ 0::/test-cgroup/test-cgroup-nested (deleted)
+
+
+2-3. [Un]populated Notification
+
+Each non-root cgroup has a "cgroup.events" file which contains
+"populated" field indicating whether the cgroup's sub-hierarchy has
+live processes in it. Its value is 0 if there is no live process in
+the cgroup and its descendants; otherwise, 1. poll and [id]notify
+events are triggered when the value changes. This can be used, for
+example, to start a clean-up operation after all processes of a given
+sub-hierarchy have exited. The populated state updates and
+notifications are recursive. Consider the following sub-hierarchy
+where the numbers in the parentheses represent the numbers of processes
+in each cgroup.
+
+ A(4) - B(0) - C(1)
+ \ D(0)
+
+A, B and C's "populated" fields would be 1 while D's 0. After the one
+process in C exits, B and C's "populated" fields would flip to "0" and
+file modified events will be generated on the "cgroup.events" files of
+both cgroups.
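+
+As a sketch, a clean-up agent could wait for the notification using
+inotify, here via inotifywait from the inotify-tools package, with the
+cgroup assumed to live at /sys/fs/cgroup/A:
+
+ # inotifywait -e modify /sys/fs/cgroup/A/cgroup.events
+ # cat /sys/fs/cgroup/A/cgroup.events
+ populated 0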
+
+
+2-4. Controlling Controllers
+
+2-4-1. Enabling and Disabling
+
+Each cgroup has a "cgroup.controllers" file which lists all
+controllers available for the cgroup to enable.
+
+ # cat cgroup.controllers
+ cpu io memory
+
+No controller is enabled by default. Controllers can be enabled and
+disabled by writing to the "cgroup.subtree_control" file.
+
+ # echo "+cpu +memory -io" > cgroup.subtree_control
+
+Only controllers which are listed in "cgroup.controllers" can be
+enabled. When multiple operations are specified as above, either they
+all succeed or fail. If multiple operations on the same controller
+are specified, the last one is effective.
+
+Enabling a controller in a cgroup indicates that the distribution of
+the target resource across its immediate children will be controlled.
+Consider the following sub-hierarchy. The enabled controllers are
+listed in parentheses.
+
+ A(cpu,memory) - B(memory) - C()
+ \ D()
+
+As A has "cpu" and "memory" enabled, A will control the distribution
+of CPU cycles and memory to its children, in this case, B. As B has
+"memory" enabled but not "CPU", C and D will compete freely on CPU
+cycles but their division of memory available to B will be controlled.
+
+As a controller regulates the distribution of the target resource to
+the cgroup's children, enabling it creates the controller's interface
+files in the child cgroups. In the above example, enabling "cpu" on B
+would create the "cpu." prefixed controller interface files in C and
+D. Likewise, disabling "memory" from B would remove the "memory."
+prefixed controller interface files from C and D. This means that the
+controller interface files - anything which doesn't start with
+"cgroup." are owned by the parent rather than the cgroup itself.
+
+
+2-4-2. Top-down Constraint
+
+Resources are distributed top-down and a cgroup can further distribute
+a resource only if the resource has been distributed to it from the
+parent. This means that all non-root "cgroup.subtree_control" files
+can only contain controllers which are enabled in the parent's
+"cgroup.subtree_control" file. A controller can be enabled only if
+the parent has the controller enabled and a controller can't be
+disabled if one or more children have it enabled.
+
+
+2-4-3. No Internal Process Constraint
+
+Non-root cgroups can only distribute resources to their children when
+they don't have any processes of their own. In other words, only
+cgroups which don't contain any processes can have controllers enabled
+in their "cgroup.subtree_control" files.
+
+This guarantees that, when a controller is looking at the part of the
+hierarchy which has it enabled, processes are always only on the
+leaves. This rules out situations where child cgroups compete against
+internal processes of the parent.
+
+The root cgroup is exempt from this restriction. Root contains
+processes and anonymous resource consumption which can't be associated
+with any other cgroups and requires special treatment from most
+controllers. How resource consumption in the root cgroup is governed
+is up to each controller.
+
+Note that the restriction doesn't get in the way if there is no
+enabled controller in the cgroup's "cgroup.subtree_control". This is
+important as otherwise it wouldn't be possible to create children of a
+populated cgroup. To control resource distribution of a cgroup, the
+cgroup must create children and transfer all its processes to the
+children before enabling controllers in its "cgroup.subtree_control"
+file.
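+
+As a sketch, a populated cgroup "A" can be turned into a pure parent
+by first moving its processes into a new leaf (names assumed for
+illustration):
+
+ # mkdir A/leaf
+ # while read pid; do echo "$pid" > A/leaf/cgroup.procs; done \
+ < A/cgroup.procs
+ # echo "+memory" > A/cgroup.subtree_control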
+
+
+2-5. Delegation
+
+2-5-1. Model of Delegation
+
+A cgroup can be delegated to a less privileged user by granting write
+access of the directory and its "cgroup.procs" file to the user. Note
+that resource control interface files in a given directory control the
+distribution of the parent's resources and thus must not be delegated
+along with the directory.
+
+Once delegated, the user can build sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it received from the parent. The limits and other settings of all
+resource controllers are hierarchical and regardless of what happens
+in the delegated sub-hierarchy, nothing can escape the resource
+restrictions imposed by the parent.
+
+Currently, cgroup doesn't impose any restrictions on the number of
+cgroups in or nesting depth of a delegated sub-hierarchy; however,
+this may be limited explicitly in the future.
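+
+The delegation step itself is plain file ownership. A sketch,
+assuming a cgroup "C0" and an unprivileged user "U0" as in the
+containment example below:
+
+ # chown U0 /sys/fs/cgroup/C0
+ # chown U0 /sys/fs/cgroup/C0/cgroup.procs
+
+The resource control files in C0 deliberately stay owned by root
+since they control the distribution of the parent's resources.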
+
+
+2-5-2. Delegation Containment
+
+A delegated sub-hierarchy is contained in the sense that processes
+can't be moved into or out of the sub-hierarchy by the delegatee. For
+a process with a non-root euid to migrate a target process into a
+cgroup by writing its PID to the "cgroup.procs" file, the following
+conditions must be met.
+
+- The writer's euid must match either uid or suid of the target process.
+
+- The writer must have write access to the "cgroup.procs" file.
+
+- The writer must have write access to the "cgroup.procs" file of the
+ common ancestor of the source and destination cgroups.
+
+The above three constraints ensure that while a delegatee may migrate
+processes around freely in the delegated sub-hierarchy it can't pull
+in from or push out to outside the sub-hierarchy.
+
+For an example, let's assume cgroups C0 and C1 have been delegated to
+user U0 who created C00, C01 under C0 and C10 under C1 as follows and
+all processes under C0 and C1 belong to U0.
+
+ ~~~~~~~~~~~~~ - C0 - C00
+ ~ cgroup ~ \ C01
+ ~ hierarchy ~
+ ~~~~~~~~~~~~~ - C1 - C10
+
+Let's also say U0 wants to write the PID of a process which is
+currently in C10 into "C00/cgroup.procs". U0 has write access to the
+file and uid match on the process; however, the common ancestor of the
+source cgroup C10 and the destination cgroup C00 is above the points
+of delegation and U0 would not have write access to its "cgroup.procs"
+files and thus the write will be denied with -EACCES.
+
+
+2-6. Guidelines
+
+2-6-1. Organize Once and Control
+
+Migrating a process across cgroups is a relatively expensive operation
+and stateful resources such as memory are not moved together with the
+process. This is an explicit design decision as there often exist
+inherent trade-offs between migration and various hot paths in terms
+of synchronization cost.
+
+As such, migrating processes across cgroups frequently as a means to
+apply different resource restrictions is discouraged. A workload
+should be assigned to a cgroup according to the system's logical and
+resource structure once on start-up. Dynamic adjustments to resource
+distribution can be made by changing controller configuration through
+the interface files.
+
+
+2-6-2. Avoid Name Collisions
+
+Interface files for a cgroup and its children cgroups occupy the same
+directory and it is possible to create children cgroups which collide
+with interface files.
+
+All cgroup core interface files are prefixed with "cgroup." and each
+controller's interface files are prefixed with the controller name and
+a dot. A controller's name is composed of lower case letters and
+'_'s but never begins with an '_' so it can be used as the prefix
+character for collision avoidance. Also, interface file names won't
+start or end with terms which are often used in categorizing workloads
+such as job, service, slice, unit or workload.
+
+cgroup doesn't do anything to prevent name collisions and it's the
+user's responsibility to avoid them.
+
+
+3. Resource Distribution Models
+
+cgroup controllers implement several resource distribution schemes
+depending on the resource type and expected use cases. This section
+describes major schemes in use along with their expected behaviors.
+
+
+3-1. Weights
+
+A parent's resource is distributed by adding up the weights of all
+active children and giving each the fraction matching the ratio of its
+weight against the sum. As only children which can make use of the
+resource at the moment participate in the distribution, this is
+work-conserving. Due to the dynamic nature, this model is usually
+used for stateless resources.
+
+All weights are in the range [1, 10000] with the default at 100. This
+allows symmetric multiplicative biases in both directions at fine
+enough granularity while staying in the intuitive range.
+
+As long as the weight is in range, all configuration combinations are
+valid and there is no reason to reject configuration changes or
+process migrations.
+
+"cpu.weight" proportionally distributes CPU cycles to active children
+and is an example of this type.
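+
+For example, if two active children have "cpu.weight" of 200 and 100
+and both can use the CPU, they receive 200/300 and 100/300 of the
+parent's CPU cycles respectively. The weights could be configured as
+follows (hypothetical paths).
+
+ # echo 200 > /sys/fs/cgroup/A/cpu.weight
+ # echo 100 > /sys/fs/cgroup/B/cpu.weight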
+
+
+3-2. Limits
+
+A child can only consume up to the configured amount of the resource.
+Limits can be over-committed - the sum of the limits of children can
+exceed the amount of resource available to the parent.
+
+Limits are in the range [0, max] and default to "max", which is a
+no-op.
+
+As limits can be over-committed, all configuration combinations are
+valid and there is no reason to reject configuration changes or
+process migrations.
+
+"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
+on an IO device and is an example of this type.
+
+
+3-3. Protections
+
+A cgroup is protected to be allocated up to the configured amount of
+the resource if the usages of all its ancestors are under their
+protected levels. Protections can be hard guarantees or best effort
+soft boundaries. Protections can also be over-committed, in which
+case only up to the amount available to the parent is protected
+among children.
+
+Protections are in the range [0, max] and default to 0, which is a
+no-op.
+
+As protections can be over-committed, all configuration combinations
+are valid and there is no reason to reject configuration changes or
+process migrations.
+
+"memory.low" implements best-effort memory protection and is an
+example of this type.
+
+
+3-4. Allocations
+
+A cgroup is exclusively allocated a certain amount of a finite
+resource. Allocations can't be over-committed - the sum of the
+allocations of children can not exceed the amount of resource
+available to the parent.
+
+Allocations are in the range [0, max] and default to 0, which means
+no resource is allocated.
+
+As allocations can't be over-committed, some configuration
+combinations are invalid and should be rejected. Also, if the
+resource is mandatory for execution of processes, process migrations
+may be rejected.
+
+"cpu.rt.max" hard-allocates realtime slices and is an example of this
+type.
+
+
+4. Interface Files
+
+4-1. Format
+
+All interface files should be in one of the following formats whenever
+possible.
+
+ New-line separated values
+ (when only one value can be written at once)
+
+ VAL0\n
+ VAL1\n
+ ...
+
+ Space separated values
+ (when read-only or multiple values can be written at once)
+
+ VAL0 VAL1 ...\n
+
+ Flat keyed
+
+ KEY0 VAL0\n
+ KEY1 VAL1\n
+ ...
+
+ Nested keyed
+
+ KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+ KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+ ...
+
+For a writable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time. For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
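+
+For example, a single key of a nested keyed file such as "io.max"
+(see 5-3-1) can be updated with only some of its sub key pairs,
+specified in any order; the device number here is illustrative.
+
+ # echo "8:16 wiops=120 rbps=2097152" > io.max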
+
+
+4-2. Conventions
+
+- Settings for a single feature should be contained in a single file.
+
+- The root cgroup should be exempt from resource control and thus
+ shouldn't have resource control interface files. Also,
+ informational files on the root cgroup which end up showing global
+ information available elsewhere shouldn't exist.
+
+- If a controller implements weight based resource distribution, its
+ interface file should be named "weight" and have the range [1,
+ 10000] with 100 as the default. The values are chosen to allow
+ sufficient and symmetric bias in both directions while keeping the
+ range intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+ limit, the interface files should be named "min" and "max"
+ respectively. If a controller implements a best effort resource
+ guarantee and/or limit, the interface files should be named "low"
+ and "high" respectively.
+
+ In the above four control files, the special token "max" should be
+ used to represent upward infinity for both reading and writing.
+
+- If a setting has a configurable default value and keyed specific
+ overrides, the default entry should be keyed with "default" and
+ appear as the first entry in the file.
+
+ The default value can be updated by writing either "default $VAL" or
+ "$VAL".
+
+ When writing to update a specific override, "default" can be used as
+ the value to indicate removal of the override. Override entries
+ with "default" as the value must not appear when read.
+
+ For example, a setting which is keyed by major:minor device numbers
+ with integer values may look like the following.
+
+ # cat cgroup-example-interface-file
+ default 150
+ 8:0 300
+
+ The default value can be updated by
+
+ # echo 125 > cgroup-example-interface-file
+
+ or
+
+ # echo "default 125" > cgroup-example-interface-file
+
+ An override can be set by
+
+ # echo "8:16 170" > cgroup-example-interface-file
+
+ and cleared by
+
+ # echo "8:0 default" > cgroup-example-interface-file
+ # cat cgroup-example-interface-file
+ default 125
+ 8:16 170
+
+- For events which are not very high frequency, an interface file
+ "events" should be created which lists event key value pairs.
+ Whenever a notifiable event happens, a file modified event should
+ be generated on the file.
+
+
+4-3. Core Interface Files
+
+All cgroup core files are prefixed with "cgroup."
+
+ cgroup.procs
+
+ A read-write new-line separated values file which exists on
+ all cgroups.
+
+ When read, it lists the PIDs of all processes which belong to
+ the cgroup one-per-line. The PIDs are not ordered and the
+ same PID may show up more than once if the process got moved
+ to another cgroup and then back or the PID got recycled while
+ reading.
+
+ A PID can be written to migrate the process associated with
+ the PID to the cgroup. The writer must meet all of the
+ following conditions.
+
+ - Its euid must be root or match either the uid or the suid
+ of the target process.
+
+ - It must have write access to the "cgroup.procs" file.
+
+ - It must have write access to the "cgroup.procs" file of the
+ common ancestor of the source and destination cgroups.
+
+ When delegating a sub-hierarchy, write access to this file
+ should be granted along with the containing directory.
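+
+ For example, one way to delegate the sub-hierarchy rooted at
+ C0 to user U0 is changing the ownership of the directory and
+ the file (hypothetical paths).
+
+ # chown U0 /sys/fs/cgroup/C0 /sys/fs/cgroup/C0/cgroup.procs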
+
+ cgroup.controllers
+
+ A read-only space separated values file which exists on all
+ cgroups.
+
+ It shows a space separated list of all controllers available
+ to the cgroup. The controllers are not ordered.
+
+ cgroup.subtree_control
+
+ A read-write space separated values file which exists on all
+ cgroups. Starts out empty.
+
+ When read, it shows a space separated list of the controllers
+ which are enabled to control resource distribution from the
+ cgroup to its children.
+
+ A space separated list of controllers prefixed with '+' or
+ '-' can be written to enable or disable controllers. A
+ controller name prefixed with '+' enables the controller and
+ one prefixed with '-' disables it. If a controller appears
+ more than once on the list, the last entry is effective.
+ When multiple enable and disable operations are specified,
+ either all succeed or all fail.
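+
+ For example, the following enables the memory controller and
+ disables the io controller in the children of the cgroup.
+
+ # echo "+memory -io" > cgroup.subtree_control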
+
+ cgroup.events
+
+ A read-only flat-keyed file which exists on non-root cgroups.
+ The following entries are defined. Unless specified
+ otherwise, a value change in this file generates a file
+ modified event.
+
+ populated
+
+ 1 if the cgroup or its descendants contain any live
+ processes; otherwise, 0.
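+
+ As changes to this file generate file modified events, the
+ "populated" state can be watched with the usual file
+ monitoring mechanisms; for example, with the inotify-tools
+ package installed, the following waits for the next change.
+
+ # inotifywait -e modify cgroup.events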
+
+
+5. Controllers
+
+5-1. CPU
+
+[NOTE: The interface for the cpu controller hasn't been merged yet]
+
+The "cpu" controllers regulates distribution of CPU cycles. This
+controller implements weight and absolute bandwidth limit models for
+normal scheduling policy and absolute bandwidth allocation model for
+realtime scheduling policy.
+
+
+5-1-1. CPU Interface Files
+
+All time durations are in microseconds.
+
+ cpu.stat
+
+ A read-only flat-keyed file which exists on non-root cgroups.
+
+ It reports the following six stats.
+
+ usage_usec
+ user_usec
+ system_usec
+ nr_periods
+ nr_throttled
+ throttled_usec
+
+ cpu.weight
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "100".
+
+ The weight in the range [1, 10000].
+
+ cpu.max
+
+ A read-write two value file which exists on non-root cgroups.
+ The default is "max 100000".
+
+ The maximum bandwidth limit. It's in the following format.
+
+ $MAX $PERIOD
+
+ which indicates that the group may consume up to $MAX in each
+ $PERIOD duration. "max" for $MAX indicates no limit. If only
+ one number is written, $MAX is updated.
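+
+ For example, the following limits the cgroup to 50ms of CPU
+ time every 100ms and then raises $MAX alone to 75ms (the
+ interface is not merged yet, as noted above).
+
+ # echo "50000 100000" > cpu.max
+ # echo 75000 > cpu.max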
+
+ cpu.rt.max
+
+ [NOTE: The semantics of this file are still under discussion and the
+ interface hasn't been merged yet]
+
+ A read-write two value file which exists on all cgroups.
+ The default is "0 100000".
+
+ The maximum realtime runtime allocation. Over-committing
+ configurations are disallowed and process migrations are
+ rejected if not enough bandwidth is available. It's in the
+ following format.
+
+ $MAX $PERIOD
+
+ which indicates that the group may consume up to $MAX in each
+ $PERIOD duration. If only one number is written, $MAX is
+ updated.
+
+
+5-2. Memory
+
+The "memory" controller regulates distribution of memory. Memory is
+stateful and implements both limit and protection models. Due to the
+intertwining between memory usage and reclaim pressure and the
+stateful nature of memory, the distribution model is relatively
+complex.
+
+While not completely water-tight, all major memory usages by a given
+cgroup are tracked so that the total memory consumption can be
+accounted and controlled to a reasonable extent. Currently, the
+following types of memory usages are tracked.
+
+- Userland memory - page cache and anonymous memory.
+
+- Kernel data structures such as dentries and inodes.
+
+- TCP socket buffers.
+
+The above list may expand in the future for better coverage.
+
+
+5-2-1. Memory Interface Files
+
+All memory amounts are in bytes. If a value which is not aligned to
+PAGE_SIZE is written, the value may be rounded up to the closest
+PAGE_SIZE multiple when read back.
+
+ memory.current
+
+ A read-only single value file which exists on non-root
+ cgroups.
+
+ The total amount of memory currently being used by the cgroup
+ and its descendants.
+
+ memory.low
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ Best-effort memory protection. If the memory usages of a
+ cgroup and all its ancestors are below their low boundaries,
+ the cgroup's memory won't be reclaimed unless memory can be
+ reclaimed from unprotected cgroups.
+
+ Putting more memory than generally available under this
+ protection is discouraged.
+
+ memory.high
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "max".
+
+ Memory usage throttle limit. This is the main mechanism to
+ control memory usage of a cgroup. If a cgroup's usage goes
+ over the high boundary, the processes of the cgroup are
+ throttled and put under heavy reclaim pressure.
+
+ Going over the high limit never invokes the OOM killer and
+ under extreme conditions the limit may be breached.
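+
+ For example, the following sets the throttle limit to 1G
+ (values are in bytes and the special token "max" clears the
+ limit).
+
+ # echo 1073741824 > memory.high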
+
+ memory.max
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "max".
+
+ Memory usage hard limit. This is the final protection
+ mechanism. If a cgroup's memory usage reaches this limit and
+ can't be reduced, the OOM killer is invoked in the cgroup.
+ Under certain circumstances, the usage may go over the limit
+ temporarily.
+
+ This is the ultimate protection mechanism. As long as the
+ high limit is used and monitored properly, this limit's
+ utility is limited to providing the final safety net.
+
+ memory.events
+
+ A read-only flat-keyed file which exists on non-root cgroups.
+ The following entries are defined. Unless specified
+ otherwise, a value change in this file generates a file
+ modified event.
+
+ low
+
+ The number of times the cgroup is reclaimed due to
+ high memory pressure even though its usage is under
+ the low boundary. This usually indicates that the low
+ boundary is over-committed.
+
+ high
+
+ The number of times processes of the cgroup are
+ throttled and routed to perform direct memory reclaim
+ because the high memory boundary was exceeded. For a
+ cgroup whose memory usage is capped by the high limit
+ rather than global memory pressure, this event's
+ occurrences are expected.
+
+ max
+
+ The number of times the cgroup's memory usage was
+ about to go over the max boundary. If direct reclaim
+ fails to bring it down, the OOM killer is invoked.
+
+ oom
+
+ The number of times the OOM killer has been invoked in
+ the cgroup. This may not exactly match the number of
+ processes killed but should generally be close.
+
+
+5-2-2. General Usage
+
+"memory.high" is the main mechanism to control memory usage.
+Over-committing on high limit (sum of high limits > available memory)
+and letting global memory pressure to distribute memory according to
+usage is a viable strategy.
+
+Because breach of the high limit doesn't trigger the OOM killer but
+throttles the offending cgroup, a management agent has ample
+opportunities to monitor and take appropriate actions such as granting
+more memory or terminating the workload.
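+
+For example, pressure on the high boundary shows up as the "high"
+count in "memory.events", which an agent can sample; the output
+below is illustrative.
+
+ # cat memory.events
+ low 0
+ high 1234
+ max 0
+ oom 0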
+
+Determining whether a cgroup has enough memory is not trivial as
+memory usage doesn't indicate whether the workload can benefit from
+more memory. For example, a workload which writes data received
+from the network to a file can use all available memory but can
+also perform just as well with a small amount of memory. A measure
+of memory pressure - how much the workload is being impacted due to
+lack of memory - is necessary to determine whether a workload needs
+more memory; unfortunately, a memory pressure monitoring mechanism
+isn't implemented yet.
+
+
+5-2-3. Memory Ownership
+
+A memory area is charged to the cgroup which instantiated it and stays
+charged to the cgroup until the area is released. Migrating a process
+to a different cgroup doesn't move the memory usages that it
+instantiated while in the previous cgroup to the new cgroup.
+
+A memory area may be used by processes belonging to different
+cgroups. Which cgroup the area gets charged to is indeterminate;
+however, over time, the memory area is likely to end up in a cgroup
+which has enough memory allowance to avoid high reclaim pressure.
+
+If a cgroup sweeps a considerable amount of memory which is expected
+to be accessed repeatedly by other cgroups, it may make sense to use
+POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
+belonging to the affected files to ensure correct memory ownership.
+
+
+5-3. IO
+
+The "io" controller regulates the distribution of IO resources. This
+controller implements both weight based and absolute bandwidth or IOPS
+limit distribution; however, weight based distribution is available
+only if cfq-iosched is in use and neither scheme is available for
+blk-mq devices.
+
+
+5-3-1. IO Interface Files
+
+ io.stat
+
+ A read-only nested-keyed file which exists on non-root
+ cgroups.
+
+ Lines are keyed by $MAJ:$MIN device numbers and not ordered.
+ The following nested keys are defined.
+
+ rbytes Bytes read
+ wbytes Bytes written
+ rios Number of read IOs
+ wios Number of write IOs
+
+ An example read output follows.
+
+ 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
+ 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+
+ io.weight
+
+ A read-write flat-keyed file which exists on non-root cgroups.
+ The default is "default 100".
+
+ The first line is the default weight applied to devices
+ without specific override. The rest are overrides keyed by
+ $MAJ:$MIN device numbers and not ordered. The weights are in
+ the range [1, 10000] and specify the relative amount of IO
+ time the cgroup can use in relation to its siblings.
+
+ The default weight can be updated by writing either "default
+ $WEIGHT" or simply "$WEIGHT". Overrides can be set by writing
+ "$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
+
+ An example read output follows.
+
+ default 100
+ 8:16 200
+ 8:0 50
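+
+ The above configuration could be arrived at with the
+ following writes.
+
+ # echo 100 > io.weight
+ # echo "8:16 200" > io.weight
+ # echo "8:0 50" > io.weight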
+
+ io.max
+
+ A read-write nested-keyed file which exists on non-root
+ cgroups.
+
+ BPS and IOPS based IO limit. Lines are keyed by $MAJ:$MIN
+ device numbers and not ordered. The following nested keys are
+ defined.
+
+ rbps Max read bytes per second
+ wbps Max write bytes per second
+ riops Max read IO operations per second
+ wiops Max write IO operations per second
+
+ When writing, any number of nested key-value pairs can be
+ specified in any order. "max" can be specified as the value
+ to remove a specific limit. If the same key is specified
+ multiple times, the outcome is undefined.
+
+ BPS and IOPS are measured in each IO direction and IOs are
+ delayed if the limit is reached. Temporary bursts are allowed.
+
+ The following sets the read limit at 2M BPS and the write
+ limit at 120 IOPS for device 8:16.
+
+ echo "8:16 rbps=2097152 wiops=120" > io.max
+
+ Reading returns the following.
+
+ 8:16 rbps=2097152 wbps=max riops=max wiops=120
+
+ Write IOPS limit can be removed by writing the following.
+
+ echo "8:16 wiops=max" > io.max
+
+ Reading now returns the following.
+
+ 8:16 rbps=2097152 wbps=max riops=max wiops=max
+
+
+5-3-2. Writeback
+
+Page cache is dirtied through buffered writes and shared mmaps and
+written asynchronously to the backing filesystem by the writeback
+mechanism. Writeback sits between the memory and IO domains and
+regulates the proportion of dirty memory by balancing dirtying and
+write IOs.
+
+The io controller, in conjunction with the memory controller,
+implements control of page cache writeback IOs. The memory
+controller defines the memory domain for which the dirty memory
+ratio is calculated and maintained, and the io controller defines
+the io domain which writes out dirty pages for the memory domain.
+Both system-wide and per-cgroup dirty memory states are examined
+and the more restrictive of the two is enforced.
+
+cgroup writeback requires explicit support from the underlying
+filesystem. Currently, cgroup writeback is implemented on ext2, ext4
+and btrfs. On other filesystems, all writeback IOs are attributed to
+the root cgroup.
+
+There are inherent differences in memory and writeback management
+which affect how cgroup ownership is tracked. Memory is tracked per
+page while writeback is tracked per inode. For the purpose of
+writeback, an inode is assigned to a cgroup and all IO requests to
+write dirty pages from the inode are attributed to that cgroup.
+
+As cgroup ownership for memory is tracked per page, there can be pages
+which are associated with different cgroups than the one the inode is
+associated with. These are called foreign pages. The writeback
+mechanism constantly keeps track of foreign pages and, if a
+particular foreign cgroup becomes the majority over a certain
+period of time, switches the ownership of the inode to that cgroup.
+
+While this model is enough for most use cases where a given inode is
+mostly dirtied by a single cgroup even when the main writing cgroup
+changes over time, use cases where multiple cgroups write to a single
+inode simultaneously are not supported well. In such circumstances, a
+significant portion of IOs are likely to be attributed incorrectly.
+As the memory controller assigns page ownership on the first use
+and doesn't update it until the page is released, even if writeback
+strictly follows page ownership, multiple cgroups dirtying
+overlapping areas wouldn't work as expected. It's recommended to
+avoid such usage patterns.
+
+The sysctl knobs which affect writeback behavior are applied to cgroup
+writeback as follows.
+
+ vm.dirty_background_ratio
+ vm.dirty_ratio
+
+ These ratios apply the same to cgroup writeback with the
+ amount of available memory capped by limits imposed by the
+ memory controller and system-wide clean memory.
+
+ vm.dirty_background_bytes
+ vm.dirty_bytes
+
+ For cgroup writeback, these are calculated into a ratio
+ against total available memory and applied the same way as
+ vm.dirty[_background]_ratio.
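+
+For example (illustrative numbers), on a system with 16G of
+available memory, setting "vm.dirty_bytes" to 1G results in a 1/16
+ratio being applied against each cgroup's available memory.
+
+ # sysctl -w vm.dirty_bytes=1073741824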
+
+
+P. Information on Kernel Programming
+
+This section contains kernel programming information in the areas
+where interacting with cgroup is necessary. cgroup core and
+controllers are not covered.
+
+
+P-1. Filesystem Support for Writeback
+
+A filesystem can support cgroup writeback by updating
+address_space_operations->writepage[s]() to annotate bio's using the
+following two functions.
+
+ wbc_init_bio(@wbc, @bio)
+
+ Should be called for each bio carrying writeback data and
+ associates the bio with the inode's owner cgroup. Can be
+ called anytime between bio allocation and submission.
+
+ wbc_account_io(@wbc, @page, @bytes)
+
+ Should be called for each data segment being written out.
+ While this function doesn't care exactly when it's called
+ during the writeback session, it's the easiest and most
+ natural to call it as data segments are added to a bio.
+
+With writeback bio's annotated, cgroup support can be enabled per
+super_block by setting SB_I_CGROUPWB in ->s_iflags. This allows for
+selective disabling of cgroup writeback support which is helpful when
+certain filesystem features, e.g. journaled data mode, are
+incompatible.
+
+wbc_init_bio() binds the specified bio to its cgroup. Depending on
+the configuration, the bio may be executed at a lower priority and if
+the writeback session is holding shared resources, e.g. a journal
+entry, may lead to priority inversion. There is no one easy solution
+for the problem. Filesystems can try to work around specific problem
+cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+directly.
+
+
+D. Deprecated v1 Core Features
+
+- Multiple hierarchies including named ones are not supported.
+
+- Mount options and remounting are not supported.
+
+- The "tasks" file is removed and "cgroup.procs" is not sorted.
+
+- "cgroup.clone_children" is removed.
+
+- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file
+ at the root instead.
+
+
+R. Issues with v1 and Rationales for v2
+
+R-1. Multiple Hierarchies
+
+cgroup v1 allowed an arbitrary number of hierarchies and each
+hierarchy could host any number of controllers. While this seemed to
+provide a high level of flexibility, it wasn't useful in practice.
+
+For example, as there is only one instance of each controller, utility
+type controllers such as freezer which can be useful in all
+hierarchies could only be used in one. The issue is exacerbated by
+the fact that controllers couldn't be moved to another hierarchy once
+hierarchies were populated. Another issue was that all controllers
+bound to a hierarchy were forced to have exactly the same view of the
+hierarchy. It wasn't possible to vary the granularity depending on
+the specific controller.
+
+In practice, these issues heavily limited which controllers could be
+put on the same hierarchy and most configurations resorted to putting
+each controller on its own hierarchy. Only closely related ones, such
+as the cpu and cpuacct controllers, made sense to be put on the same
+hierarchy. This often meant that userland ended up managing multiple
+similar hierarchies repeating the same steps on each hierarchy
+whenever a hierarchy management operation was necessary.
+
+Furthermore, support for multiple hierarchies came at a steep cost.
+It greatly complicated the cgroup core implementation but, more
+importantly, restricted how cgroup could be used in general and
+what controllers were able to do.
+
+There was no limit on how many hierarchies there might be, which meant
+that a thread's cgroup membership couldn't be described in finite
+length. The key might contain any number of entries and was unlimited
+in length, which made it highly awkward to manipulate and led to
+the addition of controllers which existed only to identify
+membership, which in turn exacerbated the original problem of a
+proliferating number of hierarchies.
+
+Also, as a controller couldn't have any expectation regarding the
+topologies of hierarchies other controllers might be on, each
+controller had to assume that all other controllers were attached to
+completely orthogonal hierarchies. This made it impossible, or at
+least very cumbersome, for controllers to cooperate with each other.
+
+In most use cases, putting controllers on hierarchies which are
+completely orthogonal to each other isn't necessary. What usually is
+called for is the ability to have differing levels of granularity
+depending on the specific controller. In other words, hierarchy may
+be collapsed from leaf towards root when viewed from specific
+controllers. For example, a given configuration might not care about
+how memory is distributed beyond a certain level while still wanting
+to control how CPU cycles are distributed.
+
+
+R-2. Thread Granularity
+
+cgroup v1 allowed threads of a process to belong to different
+cgroups. This didn't make sense for some controllers and those
+controllers ended up implementing different ways to ignore such
+situations; much more importantly, it blurred the line between the
+API exposed to individual applications and the system management
+interface.
+
+Generally, in-process knowledge is available only to the process
+itself; thus, unlike service-level organization of processes,
+categorizing threads of a process requires active participation from
+the application which owns the target process.
+
+cgroup v1 had an ambiguously defined delegation model which got abused
+in combination with thread granularity. cgroups were delegated to
+individual applications so that they could create and manage their
+own sub-hierarchies and control resource distributions along them.
+This effectively raised cgroup to the status of a syscall-like API
+exposed to lay programs.
+
+First of all, cgroup has a fundamentally inadequate interface to be
+exposed this way. For a process to access its own knobs, it has to
+extract the path on the target hierarchy from /proc/self/cgroup,
+construct the path by appending the name of the knob to the path, open
+and then read and/or write to it. This is not only extremely clunky
+and unusual but also inherently racy. There is no conventional way
+to define a transaction across the required steps and nothing can
+guarantee that the process would actually be operating on its own
+sub-hierarchy.
+
+cgroup controllers implemented a number of knobs which would never
+be accepted as public APIs because they were just adding control
+knobs to a system-management pseudo filesystem. cgroup ended up
+with interface knobs which were not properly abstracted or refined
+and directly revealed kernel internal details. These knobs got
+exposed to individual applications through the ill-defined
+delegation mechanism, effectively abusing cgroup as a shortcut to
+implementing public APIs without going through the required
+scrutiny.
+
+This was painful for both userland and kernel. Userland ended up
+with misbehaving and poorly abstracted interfaces and the kernel
+inadvertently exposed and became locked into those constructs.
+
+
+R-3. Competition Between Inner Nodes and Threads
+
+cgroup v1 allowed threads to be in any cgroup, which created an
+interesting problem where threads belonging to a parent cgroup and
+its children cgroups competed for resources. This was nasty as two
+different types of entities competed and there was no obvious way to
+settle it. Different controllers did different things.
+
+The cpu controller considered threads and cgroups as equivalents and
+mapped nice levels to cgroup weights. This worked for some cases but
+fell flat when children wanted to be allocated specific ratios of CPU
+cycles and the number of internal threads fluctuated - the ratios
+constantly changed as the number of competing entities fluctuated.
+There also were other issues. The mapping from nice level to weight
+wasn't obvious or universal, and there were various other knobs which
+simply weren't available for threads.
+
+The io controller implicitly created a hidden leaf node for each
+cgroup to host the threads. The hidden leaf had its own copies of all
+the knobs with "leaf_" prefixed. While this allowed equivalent
+control over internal threads, it was with serious drawbacks. It
+always added an extra layer of nesting which wouldn't be necessary
+otherwise, made the interface messy and significantly complicated the
+implementation.
+
+The memory controller didn't have a way to control what happened
+between internal tasks and child cgroups and the behavior was not
+clearly defined. There were attempts to add ad-hoc behaviors and
+knobs to tailor the behavior to specific workloads which would have
+led to problems extremely difficult to resolve in the long term.
+
+Multiple controllers struggled with internal tasks and came up with
+different ways to deal with them; unfortunately, all the approaches
+were severely flawed and, furthermore, the widely different
+behaviors made cgroup as a whole highly inconsistent.
+
+This clearly is a problem which needs to be addressed from cgroup core
+in a uniform way.
+
+
+R-4. Other Interface Issues
+
+cgroup v1 grew without oversight and developed a large number of
+idiosyncrasies and inconsistencies. One issue on the cgroup core side
+was how an empty cgroup was notified - a userland helper binary was
+forked and executed for each event. The event delivery wasn't
+recursive or delegatable. The limitations of the mechanism also
+led to an in-kernel event delivery filtering mechanism, further
+complicating the interface.
+
+Controller interfaces were problematic too. An extreme example is
+controllers completely ignoring hierarchical organization and treating
+all cgroups as if they were all located directly under the root
+cgroup. Some controllers exposed a large amount of inconsistent
+implementation details to userland.
+
+There also was no consistency across controllers. When a new cgroup
+was created, some controllers defaulted to not imposing extra
+restrictions while others disallowed any resource usage until
+explicitly configured. Configuration knobs for the same type of
+control used widely differing naming schemes and formats. Statistics
+and information knobs were named arbitrarily and used different
+formats and units even in the same controller.
+
+cgroup v2 establishes common conventions where appropriate and updates
+controllers so that they expose minimal and consistent interfaces.
+
+
+R-5. Controller Issues and Remedies
+
+R-5-1. Memory
+
+The original lower boundary, the soft limit, is defined as a limit
+that is per default unset. As a result, the set of cgroups that
+global reclaim prefers is opt-in, rather than opt-out. The costs for
+optimizing these mostly negative lookups are so high that the
+implementation, despite its enormous size, does not even provide the
+basic desirable behavior. First off, the soft limit has no
+hierarchical meaning. All configured groups are organized in a global
+rbtree and treated like equal peers, regardless of where they are
+located in the hierarchy. This makes subtree delegation impossible.
+Second, the soft limit reclaim pass is so aggressive that it not
+only introduces high allocation latencies into the system, but also
+impacts system performance due to overreclaim, to the point where
+the feature becomes self-defeating.
+
+The memory.low boundary on the other hand is a top-down allocated
+reserve. A cgroup enjoys reclaim protection when it and all its
+ancestors are below their low boundaries, which makes delegation of
+subtrees possible. Secondly, new cgroups have no reserve per default
+and in the common case most cgroups are eligible for the preferred
+reclaim pass. This allows the new low boundary to be efficiently
+implemented with just a minor addition to the generic reclaim code,
+without the need for out-of-band data structures and reclaim passes.
+Because the generic reclaim code considers all cgroups except for the
+ones running low in the preferred first reclaim pass, overreclaim of
+individual groups is eliminated as well, resulting in much better
+overall workload performance.
+
+The original high boundary, the hard limit, is defined as a strict
+limit that can not budge, even if the OOM killer has to be called.
+But this generally goes against the goal of making the most out of the
+available memory. The memory consumption of workloads varies during
+runtime, and that requires users to overcommit. But doing that with a
+strict upper limit requires either a fairly accurate prediction of the
+working set size or adding slack to the limit. Since working set size
+estimation is hard and error prone, and getting it wrong results in
+OOM kills, most users tend to err on the side of a looser limit and
+end up wasting precious resources.
+
+The memory.high boundary on the other hand can be set much more
+conservatively. When hit, it throttles allocations by forcing them
+into direct reclaim to work off the excess, but it never invokes the
+OOM killer. As a result, a high boundary that is chosen too
+aggressively will not terminate the processes, but instead it will
+lead to gradual performance degradation. The user can monitor this
+and make corrections until the minimal memory footprint that still
+gives acceptable performance is found.
+
+In extreme cases, with many concurrent allocations and a complete
+breakdown of reclaim progress within the group, the high boundary can
+be exceeded. But even then it's mostly better to satisfy the
+allocation from the slack available in other groups or the rest of the
+system than killing the group. Otherwise, memory.max is there to
+limit this type of spillover and ultimately contain buggy or even
+malicious applications.
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
deleted file mode 100644
index 781b1d475bcf..000000000000
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ /dev/null
@@ -1,647 +0,0 @@
-
-Cgroup unified hierarchy
-
-April, 2014 Tejun Heo <tj@kernel.org>
-
-This document describes the changes made by unified hierarchy and
-their rationales. It will eventually be merged into the main cgroup
-documentation.
-
-CONTENTS
-
-1. Background
-2. Basic Operation
- 2-1. Mounting
- 2-2. cgroup.subtree_control
- 2-3. cgroup.controllers
-3. Structural Constraints
- 3-1. Top-down
- 3-2. No internal tasks
-4. Delegation
- 4-1. Model of delegation
- 4-2. Common ancestor rule
-5. Other Changes
- 5-1. [Un]populated Notification
- 5-2. Other Core Changes
- 5-3. Controller File Conventions
- 5-3-1. Format
- 5-3-2. Control Knobs
- 5-4. Per-Controller Changes
- 5-4-1. io
- 5-4-2. cpuset
- 5-4-3. memory
-6. Planned Changes
- 6-1. CAP for resource control
-
-
-1. Background
-
-cgroup allows an arbitrary number of hierarchies and each hierarchy
-can host any number of controllers. While this seems to provide a
-high level of flexibility, it isn't quite useful in practice.
-
-For example, as there is only one instance of each controller, utility
-type controllers such as freezer which can be useful in all
-hierarchies can only be used in one. The issue is exacerbated by the
-fact that controllers can't be moved around once hierarchies are
-populated. Another issue is that all controllers bound to a hierarchy
-are forced to have exactly the same view of the hierarchy. It isn't
-possible to vary the granularity depending on the specific controller.
-
-In practice, these issues heavily limit which controllers can be put
-on the same hierarchy and most configurations resort to putting each
-controller on its own hierarchy. Only closely related ones, such as
-the cpu and cpuacct controllers, make sense to put on the same
-hierarchy. This often means that userland ends up managing multiple
-similar hierarchies repeating the same steps on each hierarchy
-whenever a hierarchy management operation is necessary.
-
-Unfortunately, support for multiple hierarchies comes at a steep cost.
-Internal implementation in cgroup core proper is dazzlingly
-complicated but more importantly the support for multiple hierarchies
-restricts how cgroup is used in general and what controllers can do.
-
-There's no limit on how many hierarchies there may be, which means
-that a task's cgroup membership can't be described in finite length.
-The key may contain any varying number of entries and is unlimited in
-length, which makes it highly awkward to handle and leads to addition
-of controllers which exist only to identify membership, which in turn
-exacerbates the original problem.
-
-Also, as a controller can't have any expectation regarding what shape
-of hierarchies other controllers would be on, each controller has to
-assume that all other controllers are operating on completely
-orthogonal hierarchies. This makes it impossible, or at least very
-cumbersome, for controllers to cooperate with each other.
-
-In most use cases, putting controllers on hierarchies which are
-completely orthogonal to each other isn't necessary. What usually is
-called for is the ability to have differing levels of granularity
-depending on the specific controller. In other words, hierarchy may
-be collapsed from leaf towards root when viewed from specific
-controllers. For example, a given configuration might not care about
-how memory is distributed beyond a certain level while still wanting
-to control how CPU cycles are distributed.
-
-Unified hierarchy is the next version of cgroup interface. It aims to
-address the aforementioned issues by having more structure while
-retaining enough flexibility for most use cases. Various other
-general and controller-specific interface issues are also addressed in
-the process.
-
-
-2. Basic Operation
-
-2-1. Mounting
-
-Currently, unified hierarchy can be mounted with the following mount
-command. Note that this is still under development and scheduled to
-change soon.
-
- mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
-
-All controllers which support the unified hierarchy and are not bound
-to other hierarchies are automatically bound to unified hierarchy and
-show up at the root of it. Controllers which are enabled only in the
-root of unified hierarchy can be bound to other hierarchies. This
-allows mixing unified hierarchy with the traditional multiple
-hierarchies in a fully backward compatible way.
-
-A controller can be moved across hierarchies only after the controller
-is no longer referenced in its current hierarchy. Because per-cgroup
-controller states are destroyed asynchronously and controllers may
-have lingering references, a controller may not show up immediately on
-the unified hierarchy after the final umount of the previous
-hierarchy. Similarly, a controller should be fully disabled to be
-moved out of the unified hierarchy and it may take some time for the
-disabled controller to become available for other hierarchies;
-furthermore, due to dependencies among controllers, other controllers
-may need to be disabled too.
-
-While useful for development and manual configurations, dynamically
-moving controllers between the unified and other hierarchies is
-strongly discouraged for production use. It is recommended to decide
-the hierarchies and controller associations before starting using the
-controllers.
-
-
-2-2. cgroup.subtree_control
-
-All cgroups on unified hierarchy have a "cgroup.subtree_control" file
-which governs which controllers are enabled on the children of the
-cgroup. Let's assume a hierarchy like the following.
-
- root - A - B - C
- \ D
-
-root's "cgroup.subtree_control" file determines which controllers are
-enabled on A. A's on B. B's on C and D. This coincides with the
-fact that controllers on the immediate sub-level are used to
-distribute the resources of the parent. In fact, it's natural to
-assume that resource control knobs of a child belong to its parent.
-Enabling a controller in a "cgroup.subtree_control" file declares that
-distribution of the respective resources of the cgroup will be
-controlled. Note that this means that controller enable states are
-shared among siblings.
-
-When read, the file contains a space-separated list of currently
-enabled controllers. A write to the file should contain a
-space-separated list of controllers with '+' or '-' prefixed (without
-the quotes). Controllers prefixed with '+' are enabled and '-'
-disabled. If a controller is listed multiple times, the last entry
-wins. The specific operations are executed atomically - either all
-succeed or fail.
-
-
-2-3. cgroup.controllers
-
-Read-only "cgroup.controllers" file contains a space-separated list of
-controllers which can be enabled in the cgroup's
-"cgroup.subtree_control" file.
-
-In the root cgroup, this lists controllers which are not bound to
-other hierarchies and the content changes as controllers are bound to
-and unbound from other hierarchies.
-
-In non-root cgroups, the content of this file equals that of the
-parent's "cgroup.subtree_control" file as only controllers enabled
-from the parent can be used in its children.
-
-
-3. Structural Constraints
-
-3-1. Top-down
-
-As it doesn't make sense to nest control of an uncontrolled resource,
-all non-root "cgroup.subtree_control" files can only contain
-controllers which are enabled in the parent's "cgroup.subtree_control"
-file. A controller can be enabled only if the parent has the
-controller enabled and a controller can't be disabled if one or more
-children have it enabled.
-
-
-3-2. No internal tasks
-
-One long-standing issue that cgroup faces is the competition between
-tasks belonging to the parent cgroup and its children cgroups. This
-is inherently nasty as two different types of entities compete and
-there is no agreed-upon obvious way to handle it. Different
-controllers are doing different things.
-
-The cpu controller considers tasks and cgroups as equivalents and maps
-nice levels to cgroup weights. This works for some cases but falls
-flat when children should be allocated specific ratios of CPU cycles
-and the number of internal tasks fluctuates - the ratios constantly
-change as the number of competing entities fluctuates. There also are
-other issues. The mapping from nice level to weight isn't obvious or
-universal, and there are various other knobs which simply aren't
-available for tasks.
-
-The io controller implicitly creates a hidden leaf node for each
-cgroup to host the tasks. The hidden leaf has its own copies of all
-the knobs with "leaf_" prefixed. While this allows equivalent control
-over internal tasks, it's with serious drawbacks. It always adds an
-extra layer of nesting which may not be necessary, makes the interface
-messy and significantly complicates the implementation.
-
-The memory controller currently doesn't have a way to control what
-happens between internal tasks and child cgroups and the behavior is
-not clearly defined. There have been attempts to add ad-hoc behaviors
-and knobs to tailor the behavior to specific workloads. Continuing
-this direction will lead to problems which will be extremely difficult
-to resolve in the long term.
-
-Multiple controllers struggle with internal tasks and came up with
-different ways to deal with it; unfortunately, all the approaches in
-use now are severely flawed and, furthermore, the widely different
-behaviors make cgroup as whole highly inconsistent.
-
-It is clear that this is something which needs to be addressed from
-cgroup core proper in a uniform way so that controllers don't need to
-worry about it and cgroup as a whole shows a consistent and logical
-behavior. To achieve that, unified hierarchy enforces the following
-structural constraint:
-
- Except for the root, only cgroups which don't contain any task may
- have controllers enabled in their "cgroup.subtree_control" files.
-
-Combined with other properties, this guarantees that, when a
-controller is looking at the part of the hierarchy which has it
-enabled, tasks are always only on the leaves. This rules out
-situations where child cgroups compete against internal tasks of the
-parent.
-
-There are two things to note. Firstly, the root cgroup is exempt from
-the restriction. Root contains tasks and anonymous resource
-consumption which can't be associated with any other cgroup and
-requires special treatment from most controllers. How resource
-consumption in the root cgroup is governed is up to each controller.
-
-Secondly, the restriction doesn't take effect if there is no enabled
-controller in the cgroup's "cgroup.subtree_control" file. This is
-important as otherwise it wouldn't be possible to create children of a
-populated cgroup. To control resource distribution of a cgroup, the
-cgroup must create children and transfer all its tasks to the children
-before enabling controllers in its "cgroup.subtree_control" file.
-
-
-4. Delegation
-
-4-1. Model of delegation
-
-A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" file to the user. Note
-that the resource control knobs in a given directory concern the
-resources of the parent and thus must not be delegated along with the
-directory.
-
-Once delegated, the user can build sub-hierarchy under the directory,
-organize processes as it sees fit and further distribute the resources
-it got from the parent. The limits and other settings of all resource
-controllers are hierarchical and regardless of what happens in the
-delegated sub-hierarchy, nothing can escape the resource restrictions
-imposed by the parent.
-
-Currently, cgroup doesn't impose any restrictions on the number of
-cgroups in or nesting depth of a delegated sub-hierarchy; however,
-this may in the future be limited explicitly.
-
-
-4-2. Common ancestor rule
-
-On the unified hierarchy, to write to a "cgroup.procs" file, in
-addition to the usual write permission to the file and uid match, the
-writer must also have write access to the "cgroup.procs" file of the
-common ancestor of the source and destination cgroups. This prevents
-delegatees from smuggling processes across disjoint sub-hierarchies.
-
-Let's say cgroups C0 and C1 have been delegated to user U0 who created
-C00, C01 under C0 and C10 under C1 as follows.
-
- ~~~~~~~~~~~~~ - C0 - C00
- ~ cgroup ~ \ C01
- ~ hierarchy ~
- ~~~~~~~~~~~~~ - C1 - C10
-
-C0 and C1 are separate entities in terms of resource distribution
-regardless of their relative positions in the hierarchy. The
-resources the processes under C0 are entitled to are controlled by
-C0's ancestors and may be completely different from C1. It's clear
-that the intention of delegating C0 to U0 is allowing U0 to organize
-the processes under C0 and further control the distribution of C0's
-resources.
-
-On traditional hierarchies, if a task has write access to "tasks" or
-"cgroup.procs" file of a cgroup and its uid agrees with the target, it
-can move the target to the cgroup. In the above example, U0 will not
-only be able to move processes in each sub-hierarchy but also across
-the two sub-hierarchies, effectively allowing it to violate the
-organizational and resource restrictions implied by the hierarchical
-structure above C0 and C1.
-
-On the unified hierarchy, let's say U0 wants to write the pid of a
-process which has a matching uid and is currently in C10 into
-"C00/cgroup.procs". U0 obviously has write access to the file and
-migration permission on the process; however, the common ancestor of
-the source cgroup C10 and the destination cgroup C00 is above the
-points of delegation and U0 would not have write access to its
-"cgroup.procs" and thus be denied with -EACCES.
-
-
-5. Other Changes
-
-5-1. [Un]populated Notification
-
-cgroup users often need a way to determine when a cgroup's
-subhierarchy becomes empty so that it can be cleaned up. cgroup
-currently provides release_agent for it; unfortunately, this mechanism
-is riddled with issues.
-
-- It delivers events by forking and execing a userland binary
- specified as the release_agent. This is a long deprecated method of
- notification delivery. It's extremely heavy, slow and cumbersome to
- integrate with larger infrastructure.
-
-- There is single monitoring point at the root. There's no way to
- delegate management of a subtree.
-
-- The event isn't recursive. It triggers when a cgroup doesn't have
- any tasks or child cgroups. Events for internal nodes trigger only
- after all children are removed. This again makes it impossible to
- delegate management of a subtree.
-
-- Events are filtered from the kernel side. A "notify_on_release"
- file is used to subscribe to or suppress release events. This is
- unnecessarily complicated and probably done this way because event
- delivery itself was expensive.
-
-Unified hierarchy implements "populated" field in "cgroup.events"
-interface file which can be used to monitor whether the cgroup's
-subhierarchy has tasks in it or not. Its value is 0 if there is no
-task in the cgroup and its descendants; otherwise, 1. poll and
-[id]notify events are triggered when the value changes.
-
-This is significantly lighter and simpler and trivially allows
-delegating management of subhierarchy - subhierarchy monitoring can
-block further propagation simply by putting itself or another process
-in the subhierarchy and monitor events that it's interested in from
-there without interfering with monitoring higher in the tree.
-
-In unified hierarchy, the release_agent mechanism is no longer
-supported and the interface files "release_agent" and
-"notify_on_release" do not exist.
-
-
-5-2. Other Core Changes
-
-- None of the mount options is allowed.
-
-- remount is disallowed.
-
-- rename(2) is disallowed.
-
-- The "tasks" file is removed. Everything should at process
- granularity. Use the "cgroup.procs" file instead.
-
-- The "cgroup.procs" file is not sorted. pids will be unique unless
- they got recycled in-between reads.
-
-- The "cgroup.clone_children" file is removed.
-
-- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
- to before exiting. If the cgroup is removed before the zombie is
- reaped, " (deleted)" is appeneded to the path.
-
-
-5-3. Controller File Conventions
-
-5-3-1. Format
-
-In general, all controller files should be in one of the following
-formats whenever possible.
-
-- Values only files
-
- VAL0 VAL1...\n
-
-- Flat keyed files
-
- KEY0 VAL0\n
- KEY1 VAL1\n
- ...
-
-- Nested keyed files
-
- KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
- KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
- ...
-
-For a writeable file, the format for writing should generally match
-reading; however, controllers may allow omitting later fields or
-implement restricted shortcuts for most common use cases.
-
-For both flat and nested keyed files, only the values for a single key
-can be written at a time. For nested keyed files, the sub key pairs
-may be specified in any order and not all pairs have to be specified.
-
-
-5-3-2. Control Knobs
-
-- Settings for a single feature should generally be implemented in a
- single file.
-
-- In general, the root cgroup should be exempt from resource control
- and thus shouldn't have resource control knobs.
-
-- If a controller implements ratio based resource distribution, the
- control knob should be named "weight" and have the range [1, 10000]
- and 100 should be the default value. The values are chosen to allow
- enough and symmetric bias in both directions while keeping it
- intuitive (the default is 100%).
-
-- If a controller implements an absolute resource guarantee and/or
- limit, the control knobs should be named "min" and "max"
- respectively. If a controller implements best effort resource
- gurantee and/or limit, the control knobs should be named "low" and
- "high" respectively.
-
- In the above four control files, the special token "max" should be
- used to represent upward infinity for both reading and writing.
-
-- If a setting has configurable default value and specific overrides,
- the default settings should be keyed with "default" and appear as
- the first entry in the file. Specific entries can use "default" as
- its value to indicate inheritance of the default value.
-
-- For events which are not very high frequency, an interface file
- "events" should be created which lists event key value pairs.
- Whenever a notifiable event happens, file modified event should be
- generated on the file.
-
-
-5-4. Per-Controller Changes
-
-5-4-1. io
-
-- blkio is renamed to io. The interface is overhauled anyway. The
- new name is more in line with the other two major controllers, cpu
- and memory, and better suited given that it may be used for cgroup
- writeback without involving block layer.
-
-- Everything including stat is always hierarchical making separate
- recursive stat files pointless and, as no internal node can have
- tasks, leaf weights are meaningless. The operation model is
- simplified and the interface is overhauled accordingly.
-
- io.stat
-
- The stat file. The reported stats are from the point where
- bio's are issued to request_queue. The stats are counted
- independent of which policies are enabled. Each line in the
- file follows the following format. More fields may later be
- added at the end.
-
- $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
-
- io.weight
-
- The weight setting, currently only available and effective if
- cfq-iosched is in use for the target device. The weight is
- between 1 and 10000 and defaults to 100. The first line
- always contains the default weight in the following format to
- use when per-device setting is missing.
-
- default $WEIGHT
-
- Subsequent lines list per-device weights of the following
- format.
-
- $MAJ:$MIN $WEIGHT
-
- Writing "$WEIGHT" or "default $WEIGHT" changes the default
- setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
- while "$MAJ:$MIN default" clears it.
-
- This file is available only on non-root cgroups.
-
- io.max
-
- The maximum bandwidth and/or iops setting, only available if
- blk-throttle is enabled. The file is of the following format.
-
- $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
-
- ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
- read/write IOs per second. "max" indicates no limit. Writing
- to the file follows the same format but the individual
- settings may be omitted or specified in any order.
-
- This file is available only on non-root cgroups.
-
-
-5-4-2. cpuset
-
-- Tasks are kept in empty cpusets after hotplug and take on the masks
- of the nearest non-empty ancestor, instead of being moved to it.
-
-- A task can be moved into an empty cpuset, and again it takes on the
- masks of the nearest non-empty ancestor.
-
-
-5-4-3. memory
-
-- use_hierarchy is on by default and the cgroup file for the flag is
- not created.
-
-- The original lower boundary, the soft limit, is defined as a limit
- that is per default unset. As a result, the set of cgroups that
- global reclaim prefers is opt-in, rather than opt-out. The costs
- for optimizing these mostly negative lookups are so high that the
- implementation, despite its enormous size, does not even provide the
- basic desirable behavior. First off, the soft limit has no
- hierarchical meaning. All configured groups are organized in a
- global rbtree and treated like equal peers, regardless where they
- are located in the hierarchy. This makes subtree delegation
- impossible. Second, the soft limit reclaim pass is so aggressive
- that it not just introduces high allocation latencies into the
- system, but also impacts system performance due to overreclaim, to
- the point where the feature becomes self-defeating.
-
- The memory.low boundary on the other hand is a top-down allocated
- reserve. A cgroup enjoys reclaim protection when it and all its
- ancestors are below their low boundaries, which makes delegation of
- subtrees possible. Secondly, new cgroups have no reserve per
- default and in the common case most cgroups are eligible for the
- preferred reclaim pass. This allows the new low boundary to be
- efficiently implemented with just a minor addition to the generic
- reclaim code, without the need for out-of-band data structures and
- reclaim passes. Because the generic reclaim code considers all
- cgroups except for the ones running low in the preferred first
- reclaim pass, overreclaim of individual groups is eliminated as
- well, resulting in much better overall workload performance.
-
-- The original high boundary, the hard limit, is defined as a strict
- limit that can not budge, even if the OOM killer has to be called.
- But this generally goes against the goal of making the most out of
- the available memory. The memory consumption of workloads varies
- during runtime, and that requires users to overcommit. But doing
- that with a strict upper limit requires either a fairly accurate
- prediction of the working set size or adding slack to the limit.
- Since working set size estimation is hard and error prone, and
- getting it wrong results in OOM kills, most users tend to err on the
- side of a looser limit and end up wasting precious resources.
-
- The memory.high boundary on the other hand can be set much more
- conservatively. When hit, it throttles allocations by forcing them
- into direct reclaim to work off the excess, but it never invokes the
- OOM killer. As a result, a high boundary that is chosen too
- aggressively will not terminate the processes, but instead it will
- lead to gradual performance degradation. The user can monitor this
- and make corrections until the minimal memory footprint that still
- gives acceptable performance is found.
-
- In extreme cases, with many concurrent allocations and a complete
- breakdown of reclaim progress within the group, the high boundary
- can be exceeded. But even then it's mostly better to satisfy the
- allocation from the slack available in other groups or the rest of
- the system than killing the group. Otherwise, memory.max is there
- to limit this type of spillover and ultimately contain buggy or even
- malicious applications.
-
-- The original control file names are unwieldy and inconsistent in
- many different ways. For example, the upper boundary hit count is
- exported in the memory.failcnt file, but an OOM event count has to
- be manually counted by listening to memory.oom_control events, and
- lower boundary / soft limit events have to be counted by first
- setting a threshold for that value and then counting those events.
- Also, usage and limit files encode their units in the filename.
- That makes the filenames very long, even though this is not
- information that a user needs to be reminded of every time they type
- out those names.
-
- To address these naming issues, as well as to signal clearly that
- the new interface carries a new configuration model, the naming
- conventions in it necessarily differ from the old interface.
-
-- The original limit files indicate the state of an unset limit with a
- Very High Number, and a configured limit can be unset by echoing -1
- into those files. But that very high number is implementation and
- architecture dependent and not very descriptive. And while -1 can
- be understood as an underflow into the highest possible value, -2 or
- -10M etc. do not work, so it's not consistent.
-
- memory.low, memory.high, and memory.max will use the string "max" to
- indicate and set the highest possible value.
-
-6. Planned Changes
-
-6-1. CAP for resource control
-
-Unified hierarchy will require one of the capabilities(7), which is
-yet to be decided, for all resource control related knobs. Process
-organization operations - creation of sub-cgroups and migration of
-processes in sub-hierarchies - may be delegated by changing the
-ownership and/or permissions on the cgroup directory and
-"cgroup.procs" interface file; however, all operations which affect
-resource control - writes to a "cgroup.subtree_control" file or any
-controller-specific knobs - will require an explicit CAP privilege.
-
-This, in part, is to prevent the cgroup interface from being
-inadvertently promoted to a programmable API used by non-privileged
-binaries. cgroup exposes various aspects of the system in ways which
-aren't properly abstracted for direct consumption by regular programs.
-This is an administration interface much closer to sysctl knobs than
-system calls. Even the basic access model, being filesystem path
-based, isn't suitable for direct consumption. There's no way to
-access "my cgroup" in a race-free way or make multiple operations
-atomic against migration to another cgroup.
-
-Another aspect is that, for better or for worse, the cgroup interface
-goes through far less scrutiny than regular interfaces for
-unprivileged userland. The upside is that cgroup is able to expose
-useful features which may not be suitable for general consumption in a
-reasonable time frame. It provides a relatively short path between
-internal details and userland-visible interface. Of course, this
-shortcut comes with high risk. We go through what we go through for
-general kernel APIs for good reasons. It may end up leaking internal
-details in a way which can exert significant pain by locking the
-kernel into a contract that can't be maintained in a reasonable
-manner.
-
-Also, due to its specific nature, cgroup and its controllers don't
-tend to attract attention from a wide scope of developers. cgroup's
-short history is already fraught with severely mis-designed
-interfaces, unnecessary commitments to and exposure of internal
-details, and broken and dangerous implementations of various features.
-
-Keeping cgroup as an administration interface is both advantageous for
-its role and imperative given its nature. Some of the cgroup features
-may make sense for unprivileged access. If deemed justified, those
-must be further abstracted and implemented as a different interface,
-be it a system call or process-private filesystem, and survive through
-the scrutiny that any interface for general consumption is required to
-go through.
-
-Requiring CAP is not a complete solution but should serve as a
-significant deterrent against spraying cgroup usages in non-privileged
-programs.
diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index 0cb44dc21f97..601256fe8c0d 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -45,21 +45,10 @@ Devices supporting OPPs must set their "operating-points-v2" property with
phandle to an OPP table in their DT node. The OPP core will use this phandle to
find the operating points for the device.
-Devices may want to choose OPP tables at runtime and so can provide a list of
-phandles here. But only *one* of them should be chosen at runtime. This must be
-accompanied by a corresponding "operating-points-names" property, to uniquely
-identify the OPP tables.
-
If required, this can be extended for SoC vendor specific bindings. Such bindings
should be documented as Documentation/devicetree/bindings/power/<vendor>-opp.txt
and should have a compatible description like: "operating-points-v2-<vendor>".
-Optional properties:
-- operating-points-names: Names of OPP tables (required if multiple OPP
- tables are present), to uniquely identify them. The same list must be present
- for all the CPUs which are sharing clock/voltage rails and hence the OPP
- tables.
-
* OPP Table Node
This describes the OPPs belonging to a device. This node can have following
@@ -100,6 +89,14 @@ Optional properties:
Entries for multiple regulators must be present in the same order as
regulators are specified in device's DT node.
+- opp-microvolt-<name>: Named opp-microvolt property. This is identical to the
+ above opp-microvolt property, but allows multiple named voltage ranges to be
+ provided for the same OPP. At runtime, the platform can pick a <name>, and the
+ matching opp-microvolt-<name> property will then be used for all OPPs. If the
+ platform doesn't pick a specific <name>, or the <name> doesn't match any
+ opp-microvolt-<name> property, then the opp-microvolt property shall be used,
+ if present.
+
- opp-microamp: The maximum current drawn by the device in microamperes
considering system specific parameters (such as transients, process, aging,
maximum operating temperature range etc.) as necessary. This may be used to
@@ -112,6 +109,9 @@ Optional properties:
for few regulators, then this should be marked as zero for them. If it isn't
required for any regulator, then this property need not be present.
+- opp-microamp-<name>: Named opp-microamp property. Similar to the
+ opp-microvolt-<name> property, but for current (microamperes) instead.
+
- clock-latency-ns: Specifies the maximum possible transition latency (in
nanoseconds) for switching to this OPP from any other OPP.
@@ -123,6 +123,26 @@ Optional properties:
- opp-suspend: Marks the OPP to be used during device suspend. Only one OPP in
the table should have this.
+- opp-supported-hw: This enables us to select only a subset of OPPs from the
+ larger OPP table, based on what version of the hardware we are running on. We
+ still can't have multiple nodes with the same opp-hz value in the OPP table.
+
+ It's a user-defined array containing a hierarchy of hardware version numbers
+ supported by the OPP. For example: on a platform with a hierarchy of three
+ version levels (A, B and C), this field should look like <X Y Z>, where X
+ corresponds to version hierarchy A, Y corresponds to version hierarchy B and Z
+ corresponds to version hierarchy C.
+
+ Each level of the hierarchy is represented by a 32 bit value, so there can be
+ only 32 different supported versions per hierarchy, i.e. 1 bit per version. A
+ value of 0xFFFFFFFF will enable the OPP for all versions of that hierarchy
+ level, while a value of 0x00000000 will disable the OPP completely, which is
+ never desirable.
+
+ If 32 versions aren't sufficient for a version hierarchy, then that version
+ hierarchy can be spread over multiple 32 bit values, i.e. <X Y Z1 Z2> in the
+ above example, where Z1 & Z2 both refer to version hierarchy Z.
+
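To illustrate the matching semantics described above, here is a hypothetical C sketch (the function and parameter names are invented for illustration, not taken from the OPP core) of how an implementation could decide whether an OPP is usable on the running hardware:

#include <stdbool.h>
#include <stdint.h>

/* Each 32 bit level carries one bit per supported version; the OPP is
 * usable only if the running version's bit is set at every level. */
static bool opp_is_supported_hw(const uint32_t *opp_mask,
				const uint32_t *running_hw,
				unsigned int levels)
{
	unsigned int i;

	for (i = 0; i < levels; i++)
		if (!(opp_mask[i] & running_hw[i]))
			return false;
	return true;
}

With the <0x20 0xff0000ff 0x0000f4f0> mask used further below, a board reporting cut 6 (bit 5, i.e. 0x20) at the first level would pass that level's check.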
- status: Marks the node enabled/disabled.
Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
@@ -157,20 +177,20 @@ Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
compatible = "operating-points-v2";
opp-shared;
- opp00 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
opp-microvolt = <970000 975000 985000>;
opp-microamp = <70000>;
clock-latency-ns = <300000>;
opp-suspend;
};
- opp01 {
+ opp@1100000000 {
opp-hz = /bits/ 64 <1100000000>;
opp-microvolt = <980000 1000000 1010000>;
opp-microamp = <80000>;
clock-latency-ns = <310000>;
};
- opp02 {
+ opp@1200000000 {
opp-hz = /bits/ 64 <1200000000>;
opp-microvolt = <1025000>;
clock-latency-ns = <290000>;
@@ -236,20 +256,20 @@ independently.
* independently.
*/
- opp00 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
opp-microvolt = <970000 975000 985000>;
opp-microamp = <70000>;
clock-latency-ns = <300000>;
opp-suspend;
};
- opp01 {
+ opp@1100000000 {
opp-hz = /bits/ 64 <1100000000>;
opp-microvolt = <980000 1000000 1010000>;
opp-microamp = <80000>;
clock-latency-ns = <310000>;
};
- opp02 {
+ opp@1200000000 {
opp-hz = /bits/ 64 <1200000000>;
opp-microvolt = <1025000>;
opp-microamp = <90000>;
@@ -312,20 +332,20 @@ DVFS state together.
compatible = "operating-points-v2";
opp-shared;
- opp00 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
opp-microvolt = <970000 975000 985000>;
opp-microamp = <70000>;
clock-latency-ns = <300000>;
opp-suspend;
};
- opp01 {
+ opp@1100000000 {
opp-hz = /bits/ 64 <1100000000>;
opp-microvolt = <980000 1000000 1010000>;
opp-microamp = <80000>;
clock-latency-ns = <310000>;
};
- opp02 {
+ opp@1200000000 {
opp-hz = /bits/ 64 <1200000000>;
opp-microvolt = <1025000>;
opp-microamp = <90000>;
@@ -338,20 +358,20 @@ DVFS state together.
compatible = "operating-points-v2";
opp-shared;
- opp10 {
+ opp@1300000000 {
opp-hz = /bits/ 64 <1300000000>;
opp-microvolt = <1045000 1050000 1055000>;
opp-microamp = <95000>;
clock-latency-ns = <400000>;
opp-suspend;
};
- opp11 {
+ opp@1400000000 {
opp-hz = /bits/ 64 <1400000000>;
opp-microvolt = <1075000>;
opp-microamp = <100000>;
clock-latency-ns = <400000>;
};
- opp12 {
+ opp@1500000000 {
opp-hz = /bits/ 64 <1500000000>;
opp-microvolt = <1010000 1100000 1110000>;
opp-microamp = <95000>;
@@ -378,7 +398,7 @@ Example 4: Handling multiple regulators
compatible = "operating-points-v2";
opp-shared;
- opp00 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
opp-microvolt = <970000>, /* Supply 0 */
<960000>, /* Supply 1 */
@@ -391,7 +411,7 @@ Example 4: Handling multiple regulators
/* OR */
- opp00 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
opp-microvolt = <970000 975000 985000>, /* Supply 0 */
<960000 965000 975000>, /* Supply 1 */
@@ -404,7 +424,7 @@ Example 4: Handling multiple regulators
/* OR */
- opp00 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
opp-microvolt = <970000 975000 985000>, /* Supply 0 */
<960000 965000 975000>, /* Supply 1 */
@@ -417,7 +437,8 @@ Example 4: Handling multiple regulators
};
};
-Example 5: Multiple OPP tables
+Example 5: opp-supported-hw
+(example: three level hierarchy of versions: cuts, substrate and process)
/ {
cpus {
@@ -426,40 +447,73 @@ Example 5: Multiple OPP tables
...
cpu-supply = <&cpu_supply>
- operating-points-v2 = <&cpu0_opp_table_slow>, <&cpu0_opp_table_fast>;
- operating-points-names = "slow", "fast";
+ operating-points-v2 = <&cpu0_opp_table_slow>;
};
};
- cpu0_opp_table_slow: opp_table_slow {
+ opp_table {
compatible = "operating-points-v2";
status = "okay";
opp-shared;
- opp00 {
+ opp@600000000 {
+ /*
+ * Supports all substrate and process versions for 0xF
+ * cuts, i.e. only the first four cuts.
+ */
+ opp-supported-hw = <0xF 0xFFFFFFFF 0xFFFFFFFF>;
opp-hz = /bits/ 64 <600000000>;
+ opp-microvolt = <900000 915000 925000>;
...
};
- opp01 {
+ opp@800000000 {
+ /*
+ * Supports:
+ * - cuts: only one, 6th cut (represented by 6th bit).
+ * - substrate: supports 16 different substrate versions
+ * - process: supports 9 different process versions
+ */
+ opp-supported-hw = <0x20 0xff0000ff 0x0000f4f0>;
opp-hz = /bits/ 64 <800000000>;
+ opp-microvolt = <900000 915000 925000>;
...
};
};
+};
+
+Example 6: opp-microvolt-<name>, opp-microamp-<name>:
+(example: device with two possible microvolt ranges: slow and fast)
- cpu0_opp_table_fast: opp_table_fast {
+/ {
+ cpus {
+ cpu@0 {
+ compatible = "arm,cortex-a7";
+ ...
+
+ operating-points-v2 = <&cpu0_opp_table>;
+ };
+ };
+
+ cpu0_opp_table: opp_table0 {
compatible = "operating-points-v2";
- status = "okay";
opp-shared;
- opp10 {
+ opp@1000000000 {
opp-hz = /bits/ 64 <1000000000>;
- ...
+ opp-microvolt-slow = <900000 915000 925000>;
+ opp-microvolt-fast = <970000 975000 985000>;
+ opp-microamp-slow = <70000>;
+ opp-microamp-fast = <71000>;
};
- opp11 {
- opp-hz = /bits/ 64 <1100000000>;
- ...
+ opp@1200000000 {
+ opp-hz = /bits/ 64 <1200000000>;
+ opp-microvolt-slow = <900000 915000 925000>, /* Supply vcc0 */
+ <910000 925000 935000>; /* Supply vcc1 */
+ opp-microvolt-fast = <970000 975000 985000>, /* Supply vcc0 */
+ <960000 965000 975000>; /* Supply vcc1 */
+ opp-microamp = <70000>; /* Will be used for both slow/fast */
};
};
};
diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt
index e63316239938..4199ffecc0ff 100644
--- a/Documentation/features/time/irq-time-acct/arch-support.txt
+++ b/Documentation/features/time/irq-time-acct/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | .. |
| arc: | TODO |
| arm: | ok |
- | arm64: | .. |
+ | arm64: | ok |
| avr32: | TODO |
| blackfin: | TODO |
| c6x: | TODO |
diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt
index af6816bccb43..df1d1f3c9af2 100644
--- a/Documentation/features/vm/huge-vmap/arch-support.txt
+++ b/Documentation/features/vm/huge-vmap/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | TODO |
| arm: | TODO |
- | arm64: | TODO |
+ | arm64: | ok |
| avr32: | TODO |
| blackfin: | TODO |
| c6x: | TODO |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c360f80c3473..5ee92cc9e578 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3416,6 +3416,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
ro [KNL] Mount root device read-only on boot
+ rodata= [KNL]
+ on Mark read-only kernel memory as read-only (default).
+ off Leave read-only kernel memory writable for debugging.
+
root= [KNL] Root filesystem
See name_to_dev_t comment in init/do_mounts.c.
diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt
index 0a5c3290e732..a33c88cd5d1d 100644
--- a/Documentation/trace/coresight.txt
+++ b/Documentation/trace/coresight.txt
@@ -190,8 +190,8 @@ expected to be accessed and controlled using those entries.
Last but not least, "struct module *owner" is expected to be set to reflect
the information carried in "THIS_MODULE".
-How to use
-----------
+How to use the tracer modules
+-----------------------------
Before trace collection can start, a coresight sink needs to be identified.
There is no limit on the number of sinks (nor sources) that can be enabled at
@@ -297,3 +297,36 @@ Info Tracing enabled
Instruction 13570831 0x8026B584 E28DD00C false ADD sp,sp,#0xc
Instruction 0 0x8026B588 E8BD8000 true LDM sp!,{pc}
Timestamp Timestamp: 17107041535
+
+How to use the STM module
+-------------------------
+
+Using the System Trace Macrocell module is the same as using the tracers - the
+only difference is that clients drive the trace capture rather than
+the program flow through the code.
+
+As with any other CoreSight component, specifics about the STM tracer can be
+found in sysfs, with more information on each entry available in [1]:
+
+root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
+enable_source hwevent_select port_enable subsystem uevent
+hwevent_enable mgmt port_select traceid
+root@genericarmv8:~#
+
+Like any other source, a sink needs to be identified and the STM enabled before
+being used:
+
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+
+From there, user space applications can request and use channels via the devfs
+interface provided for that purpose by the generic STM API:
+
+root@genericarmv8:~# ls -l /dev/20100000.stm
+crw------- 1 root root 10, 61 Jan 3 18:11 /dev/20100000.stm
+root@genericarmv8:~#
+
+Details on how to use the generic STM API can be found here [2].
+
+[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
+[2]. Documentation/trace/stm.txt
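As a rough illustration of that devfs interface (a sketch only: the device node name follows the listing above, the channel is allocated by the STM core according to the current policy, and error handling is abbreviated), a client could emit a trace payload like this:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Write one payload over an STM channel; the STM core associates a
 * channel with the file descriptor when data is first written. */
int stm_trace(const char *msg)
{
	int fd = open("/dev/20100000.stm", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, msg, strlen(msg)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}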
diff --git a/MAINTAINERS b/MAINTAINERS
index ab65bbecb159..7875f7b71546 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1007,6 +1007,10 @@ F: drivers/hwtracing/coresight/*
F: Documentation/trace/coresight.txt
F: Documentation/devicetree/bindings/arm/coresight.txt
F: Documentation/ABI/testing/sysfs-bus-coresight-devices-*
+F: tools/perf/arch/arm/util/pmu.c
+F: tools/perf/arch/arm/util/auxtrace.c
+F: tools/perf/arch/arm/util/cs_etm.c
+F: tools/perf/arch/arm/util/cs_etm.h
ARM/CORGI MACHINE SUPPORT
M: Richard Purdie <rpurdie@rpsys.net>
@@ -9356,6 +9360,7 @@ F: drivers/mmc/host/dw_mmc*
SYSTEM TRACE MODULE CLASS
M: Alexander Shishkin <alexander.shishkin@linux.intel.com>
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/ash/stm.git
F: Documentation/trace/stm.txt
F: drivers/hwtracing/stm/
F: include/linux/stm.h
diff --git a/arch/Kconfig b/arch/Kconfig
index 4e949e58b192..d4d9845530f1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG
endchoice
+config HAVE_ARCH_WITHIN_STACK_FRAMES
+ bool
+ help
+ An architecture should select this if it can walk the kernel stack
+ frames to determine if an object is part of either the arguments
+ or local variables (i.e. that it excludes saved return addresses,
+ and similar) by implementing an inline arch_within_stack_frames(),
+ which is used by CONFIG_HARDENED_USERCOPY.
+
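For context, a frame-pointer based arch_within_stack_frames() typically looks like the sketch below (simplified, and not any particular architecture's implementation; the return convention and exact frame layout are architecture specific):

/*
 * Walk the saved frame pointers and check whether [obj, obj + len)
 * lies entirely inside one frame's arguments/locals, i.e. below the
 * next frame but above the previous frame's saved fp/return address.
 * Returns 1 if so, -1 if the object overlaps frame metadata, and 0
 * if the walk could not decide.
 */
static inline int arch_within_stack_frames(const void *stack,
					   const void *stackend,
					   const void *obj,
					   unsigned long len)
{
#ifdef CONFIG_FRAME_POINTER
	const void *oldframe = __builtin_frame_address(1);
	const void *frame = oldframe ? __builtin_frame_address(2) : NULL;

	while (stack <= frame && frame < stackend) {
		if (obj + len <= frame)
			return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1;
		oldframe = frame;
		frame = *(const void * const *)frame;
	}
#endif
	return 0;
}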
config HAVE_CONTEXT_TRACKING
bool
help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 34e1569a11ee..9049ac023bee 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -35,8 +35,10 @@ config ARM
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
+ select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARM_SMCCC if CPU_V7
select HAVE_BPF_JIT
select HAVE_CC_STACKPROTECTOR
select HAVE_CONTEXT_TRACKING
@@ -1422,8 +1424,7 @@ config BIG_LITTLE
config BL_SWITCHER
bool "big.LITTLE switcher support"
- depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
- select ARM_CPU_SUSPEND
+ depends on BIG_LITTLE && MCPM && HOTPLUG_CPU && ARM_GIC
select CPU_PM
help
The big.LITTLE "switcher" provides the core functionality to
@@ -1481,7 +1482,7 @@ config HOTPLUG_CPU
config ARM_PSCI
bool "Support for the ARM Power State Coordination Interface (PSCI)"
- depends on CPU_V7
+ depends on HAVE_ARM_SMCCC
select ARM_PSCI_FW
help
Say Y here if you want Linux to communicate with system firmware
@@ -2140,7 +2141,8 @@ config ARCH_SUSPEND_POSSIBLE
def_bool y
config ARM_CPU_SUSPEND
- def_bool PM_SLEEP
+ def_bool PM_SLEEP || BL_SWITCHER || ARM_PSCI_FW
+ depends on ARCH_SUSPEND_POSSIBLE
config ARCH_HIBERNATION_POSSIBLE
bool
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index d5525bfc7e3e..9156fc303afd 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
#else
diff --git a/arch/arm/include/asm/cpuidle.h b/arch/arm/include/asm/cpuidle.h
index 0f8424924902..3848259bebf8 100644
--- a/arch/arm/include/asm/cpuidle.h
+++ b/arch/arm/include/asm/cpuidle.h
@@ -30,7 +30,7 @@ static inline int arm_cpuidle_simple_enter(struct cpuidle_device *dev,
struct device_node;
struct cpuidle_ops {
- int (*suspend)(int cpu, unsigned long arg);
+ int (*suspend)(unsigned long arg);
int (*init)(struct device_node *, int cpu);
};
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index dc641ddf0784..e22089fb44dc 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -19,6 +19,7 @@
#ifndef __ARM_KVM_ARM_H__
#define __ARM_KVM_ARM_H__
+#include <linux/const.h>
#include <linux/types.h>
/* Hyp Configuration Register (HCR) bits */
@@ -132,10 +133,9 @@
* space.
*/
#define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE (1ULL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1ULL)
-#define PTRS_PER_S2_PGD (1ULL << (KVM_PHYS_SHIFT - 30))
-#define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
+#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT)
+#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL))
+#define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
/* Virtualization Translation Control Register (VTCR) bits */
#define VTCR_SH0 (3 << 12)
@@ -162,17 +162,17 @@
#define VTTBR_X (5 - KVM_T0SZ)
#endif
#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
-#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
-#define VTTBR_VMID_SHIFT (48LLU)
-#define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT)
+#define VTTBR_BADDR_MASK (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
+#define VTTBR_VMID_SHIFT _AC(48, ULL)
+#define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
/* Hyp Syndrome Register (HSR) bits */
#define HSR_EC_SHIFT (26)
-#define HSR_EC (0x3fU << HSR_EC_SHIFT)
-#define HSR_IL (1U << 25)
+#define HSR_EC (_AC(0x3f, UL) << HSR_EC_SHIFT)
+#define HSR_IL (_AC(1, UL) << 25)
#define HSR_ISS (HSR_IL - 1)
#define HSR_ISV_SHIFT (24)
-#define HSR_ISV (1U << HSR_ISV_SHIFT)
+#define HSR_ISV (_AC(1, UL) << HSR_ISV_SHIFT)
#define HSR_SRT_SHIFT (16)
#define HSR_SRT_MASK (0xf << HSR_SRT_SHIFT)
#define HSR_FSC (0x3f)
@@ -180,9 +180,9 @@
#define HSR_SSE (1 << 21)
#define HSR_WNR (1 << 6)
#define HSR_CV_SHIFT (24)
-#define HSR_CV (1U << HSR_CV_SHIFT)
+#define HSR_CV (_AC(1, UL) << HSR_CV_SHIFT)
#define HSR_COND_SHIFT (20)
-#define HSR_COND (0xfU << HSR_COND_SHIFT)
+#define HSR_COND (_AC(0xf, UL) << HSR_COND_SHIFT)
#define FSC_FAULT (0x04)
#define FSC_ACCESS (0x08)
@@ -210,13 +210,13 @@
#define HSR_EC_DABT (0x24)
#define HSR_EC_DABT_HYP (0x25)
-#define HSR_WFI_IS_WFE (1U << 0)
+#define HSR_WFI_IS_WFE (_AC(1, UL) << 0)
-#define HSR_HVC_IMM_MASK ((1UL << 16) - 1)
+#define HSR_HVC_IMM_MASK ((_AC(1, UL) << 16) - 1)
-#define HSR_DABT_S1PTW (1U << 7)
-#define HSR_DABT_CM (1U << 8)
-#define HSR_DABT_EA (1U << 9)
+#define HSR_DABT_S1PTW (_AC(1, UL) << 7)
+#define HSR_DABT_CM (_AC(1, UL) << 8)
+#define HSR_DABT_EA (_AC(1, UL) << 9)
#define kvm_arm_exception_type \
{0, "RESET" }, \
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 194c91b610ff..c35c349da069 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -79,6 +79,8 @@
#define rr_lo_hi(a1, a2) a1, a2
#endif
+#define kvm_ksym_ref(kva) (kva)
+
#ifndef __ASSEMBLY__
struct kvm;
struct kvm_vcpu;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 6692982c9b57..bedaf65c0ff9 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -214,6 +214,19 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
}
+static inline void __cpu_init_stage2(void)
+{
+}
+
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+ phys_addr_t phys_idmap_start)
+{
+ /*
+ * TODO
+ * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
+ */
+}
+
static inline int kvm_arch_dev_ioctl_check_extension(long ext)
{
return 0;
@@ -226,7 +239,6 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 405aa1883307..c7ba9a42e857 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -66,6 +66,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
@@ -279,6 +280,11 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
pgd_t *merged_hyp_pgd,
unsigned long hyp_idmap_start) { }
+static inline unsigned int kvm_get_vmid_bits(void)
+{
+ return 8;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h
index 51622ba7c4a6..d3c0c23703b6 100644
--- a/arch/arm/include/asm/ptrace.h
+++ b/arch/arm/include/asm/ptrace.h
@@ -121,7 +121,6 @@ extern unsigned long profile_pc(struct pt_regs *regs);
#define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0))
extern int regs_query_register_offset(const char *name);
-extern const char *regs_query_register_name(unsigned int offset);
extern bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr);
extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
unsigned int n);
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 35c9db857ebe..7fb59199c6bb 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
static inline unsigned long __must_check
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
- unsigned int __ua_flags = uaccess_save_and_enable();
+ unsigned int __ua_flags;
+
+ check_object_size(to, n, false);
+ __ua_flags = uaccess_save_and_enable();
n = arm_copy_from_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
@@ -511,11 +514,15 @@ static inline unsigned long __must_check
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
#ifndef CONFIG_UACCESS_WITH_MEMCPY
- unsigned int __ua_flags = uaccess_save_and_enable();
+ unsigned int __ua_flags;
+
+ check_object_size(from, n, true);
+ __ua_flags = uaccess_save_and_enable();
n = arm_copy_to_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
#else
+ check_object_size(from, n, true);
return arm_copy_to_user(to, from, n);
#endif
}
diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h
index 4371f45c5784..d4ceaf5f299b 100644
--- a/arch/arm/include/asm/virt.h
+++ b/arch/arm/include/asm/virt.h
@@ -74,6 +74,15 @@ static inline bool is_hyp_mode_mismatched(void)
{
return !!(__boot_cpu_mode & BOOT_CPU_MODE_MISMATCH);
}
+
+static inline bool is_kernel_in_hyp_mode(void)
+{
+ return false;
+}
+
+/* The section containing the hypervisor text */
+extern char __hyp_text_start[];
+extern char __hyp_text_end[];
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index af9e59bf3831..80856def2465 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -88,8 +88,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_ARM_VIRT_EXT) += hyp-stub.o
ifeq ($(CONFIG_ARM_PSCI),y)
-obj-y += psci-call.o
obj-$(CONFIG_SMP) += psci_smp.o
endif
+obj-$(CONFIG_HAVE_ARM_SMCCC) += smccc-call.o
+
extra-y := $(head-y) vmlinux.lds
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index f89811fb9a55..7e45f69a0ddc 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/arm-smccc.h>
#include <asm/checksum.h>
#include <asm/ftrace.h>
@@ -175,3 +176,8 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
EXPORT_SYMBOL(__pv_phys_pfn_offset);
EXPORT_SYMBOL(__pv_offset);
#endif
+
+#ifdef CONFIG_HAVE_ARM_SMCCC
+EXPORT_SYMBOL(arm_smccc_smc);
+EXPORT_SYMBOL(arm_smccc_hvc);
+#endif
diff --git a/arch/arm/kernel/cpuidle.c b/arch/arm/kernel/cpuidle.c
index 318da33465f4..703926e7007b 100644
--- a/arch/arm/kernel/cpuidle.c
+++ b/arch/arm/kernel/cpuidle.c
@@ -56,7 +56,7 @@ int arm_cpuidle_suspend(int index)
int cpu = smp_processor_id();
if (cpuidle_ops[cpu].suspend)
- ret = cpuidle_ops[cpu].suspend(cpu, index);
+ ret = cpuidle_ops[cpu].suspend(index);
return ret;
}
diff --git a/arch/arm/kernel/psci-call.S b/arch/arm/kernel/psci-call.S
deleted file mode 100644
index a78e9e1e206d..000000000000
--- a/arch/arm/kernel/psci-call.S
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Copyright (C) 2015 ARM Limited
- *
- * Author: Mark Rutland <mark.rutland@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/opcodes-sec.h>
-#include <asm/opcodes-virt.h>
-
-/* int __invoke_psci_fn_hvc(u32 function_id, u32 arg0, u32 arg1, u32 arg2) */
-ENTRY(__invoke_psci_fn_hvc)
- __HVC(0)
- bx lr
-ENDPROC(__invoke_psci_fn_hvc)
-
-/* int __invoke_psci_fn_smc(u32 function_id, u32 arg0, u32 arg1, u32 arg2) */
-ENTRY(__invoke_psci_fn_smc)
- __SMC(0)
- bx lr
-ENDPROC(__invoke_psci_fn_smc)
diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S
new file mode 100644
index 000000000000..2e48b674aab1
--- /dev/null
+++ b/arch/arm/kernel/smccc-call.S
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/linkage.h>
+
+#include <asm/opcodes-sec.h>
+#include <asm/opcodes-virt.h>
+#include <asm/unwind.h>
+
+ /*
+ * Wrap C macros in asm macros to delay expansion until after the
+ * SMCCC asm macro is expanded.
+ */
+ .macro SMCCC_SMC
+ __SMC(0)
+ .endm
+
+ .macro SMCCC_HVC
+ __HVC(0)
+ .endm
+
+ .macro SMCCC instr
+UNWIND( .fnstart)
+ mov r12, sp
+ push {r4-r7}
+UNWIND( .save {r4-r7})
+ ldm r12, {r4-r7}
+ \instr
+ pop {r4-r7}
+ ldr r12, [sp, #(4 * 4)]
+ stm r12, {r0-r3}
+ bx lr
+UNWIND( .fnend)
+ .endm
+
+/*
+ * void arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
+ * unsigned long a3, unsigned long a4, unsigned long a5,
+ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_smc)
+ SMCCC SMCCC_SMC
+ENDPROC(arm_smccc_smc)
+
+/*
+ * void arm_smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
+ * unsigned long a3, unsigned long a4, unsigned long a5,
+ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_hvc)
+ SMCCC SMCCC_HVC
+ENDPROC(arm_smccc_hvc)
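For reference, callers reach these routines through the C prototypes declared in include/linux/arm-smccc.h; a minimal sketch (the wrapper name is invented, and the conduit - SMC vs. HVC - would normally be chosen from the firmware description) using the standard PSCI_0_2_FN_PSCI_VERSION function ID:

#include <linux/arm-smccc.h>
#include <linux/types.h>
#include <uapi/linux/psci.h>

/* Ask the firmware for its PSCI version over the SMC conduit;
 * result registers come back in res.a0..a3. */
static u32 query_psci_version(void)
{
	struct arm_smccc_res res;

	arm_smccc_smc(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0, 0, 0, 0, 0, &res);
	return res.a0;
}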
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d7bef2144760..4cddf20cdb82 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -16,7 +16,6 @@
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/errno.h>
#include <linux/err.h>
@@ -44,6 +43,7 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_coproc.h>
#include <asm/kvm_psci.h>
+#include <asm/sections.h>
#ifdef REQUIRES_VIRT
__asm__(".arch_extension virt");
@@ -58,9 +58,14 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
-static u8 kvm_next_vmid;
+static u32 kvm_next_vmid;
+static unsigned int kvm_vmid_bits __read_mostly;
static DEFINE_SPINLOCK(kvm_vmid_lock);
+static bool vgic_present;
+
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+
static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
{
BUG_ON(preemptible());
@@ -85,11 +90,6 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
return &kvm_arm_running_vcpu;
}
-int kvm_arch_hardware_enable(void)
-{
- return 0;
-}
-
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -132,7 +132,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.vmid_gen = 0;
/* The maximum number of VCPUs is limited by the host's GIC model */
- kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus();
+ kvm->arch.max_vcpus = vgic_present ?
+ kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
return ret;
out_free_stage2_pgd:
@@ -170,6 +171,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
int r;
switch (ext) {
case KVM_CAP_IRQCHIP:
+ r = vgic_present;
+ break;
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
case KVM_CAP_USER_MEMORY:
@@ -431,11 +434,12 @@ static void update_vttbr(struct kvm *kvm)
kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
kvm->arch.vmid = kvm_next_vmid;
kvm_next_vmid++;
+ kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
/* update vttbr to be used with the new vmid */
pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
- vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
+ vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
kvm->arch.vttbr = pgd_phys | vmid;
spin_unlock(&kvm_vmid_lock);
@@ -911,6 +915,8 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
switch (dev_id) {
case KVM_ARM_DEVICE_VGIC_V2:
+ if (!vgic_present)
+ return -ENXIO;
return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
default:
return -ENODEV;
@@ -925,6 +931,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
switch (ioctl) {
case KVM_CREATE_IRQCHIP: {
+ if (!vgic_present)
+ return -ENXIO;
return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
}
case KVM_ARM_SET_DEVICE_ADDR: {
@@ -967,43 +975,99 @@ static void cpu_init_hyp_mode(void *dummy)
pgd_ptr = kvm_mmu_get_httbr();
stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
hyp_stack_ptr = stack_page + PAGE_SIZE;
- vector_ptr = (unsigned long)__kvm_hyp_vector;
+ vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
__cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+ __cpu_init_stage2();
kvm_arm_init_debug();
}
-static int hyp_init_cpu_notify(struct notifier_block *self,
- unsigned long action, void *cpu)
+static void cpu_hyp_reinit(void)
{
- switch (action) {
- case CPU_STARTING:
- case CPU_STARTING_FROZEN:
+ if (is_kernel_in_hyp_mode()) {
+ /*
+ * __cpu_init_stage2() is safe to call even if the PM
+ * event was cancelled before the CPU was reset.
+ */
+ __cpu_init_stage2();
+ } else {
if (__hyp_get_vectors() == hyp_default_vectors)
cpu_init_hyp_mode(NULL);
- break;
}
+}
+
+static void cpu_hyp_reset(void)
+{
+ phys_addr_t boot_pgd_ptr;
+ phys_addr_t phys_idmap_start;
+
+ if (!is_kernel_in_hyp_mode()) {
+ boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+ phys_idmap_start = kvm_get_idmap_start();
+
+ __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
+ }
+}
- return NOTIFY_OK;
+static void _kvm_arch_hardware_enable(void *discard)
+{
+ if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+ cpu_hyp_reinit();
+ __this_cpu_write(kvm_arm_hardware_enabled, 1);
+ }
}
-static struct notifier_block hyp_init_cpu_nb = {
- .notifier_call = hyp_init_cpu_notify,
-};
+int kvm_arch_hardware_enable(void)
+{
+ _kvm_arch_hardware_enable(NULL);
+ return 0;
+}
+
+static void _kvm_arch_hardware_disable(void *discard)
+{
+ if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+ cpu_hyp_reset();
+ __this_cpu_write(kvm_arm_hardware_enabled, 0);
+ }
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ _kvm_arch_hardware_disable(NULL);
+}
#ifdef CONFIG_CPU_PM
static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
unsigned long cmd,
void *v)
{
- if (cmd == CPU_PM_EXIT &&
- __hyp_get_vectors() == hyp_default_vectors) {
- cpu_init_hyp_mode(NULL);
+ /*
+ * kvm_arm_hardware_enabled is left with its old value over
+ * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+ * re-enable hyp.
+ */
+ switch (cmd) {
+ case CPU_PM_ENTER:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /*
+ * don't update kvm_arm_hardware_enabled here
+ * so that the hardware will be re-enabled
+ * when we resume. See below.
+ */
+ cpu_hyp_reset();
+
+ return NOTIFY_OK;
+ case CPU_PM_EXIT:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /* The hardware was enabled before suspend. */
+ cpu_hyp_reinit();
+
return NOTIFY_OK;
- }
- return NOTIFY_DONE;
+ default:
+ return NOTIFY_DONE;
+ }
}
static struct notifier_block hyp_init_cpu_pm_nb = {
@@ -1020,6 +1084,91 @@ static inline void hyp_cpu_pm_init(void)
}
#endif
+static void teardown_common_resources(void)
+{
+ free_percpu(kvm_host_cpu_state);
+}
+
+static int init_common_resources(void)
+{
+ kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+ if (!kvm_host_cpu_state) {
+ kvm_err("Cannot allocate host CPU state\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int init_subsystems(void)
+{
+ int err = 0;
+
+ /*
+ * Enable hardware so that subsystem initialisation can access EL2.
+ */
+ on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+
+ /*
+ * Register CPU low-power notifier
+ */
+ hyp_cpu_pm_init();
+
+ /*
+ * Init HYP view of VGIC
+ */
+ err = kvm_vgic_hyp_init();
+ switch (err) {
+ case 0:
+ vgic_present = true;
+ break;
+ case -ENODEV:
+ case -ENXIO:
+ vgic_present = false;
+ err = 0;
+ break;
+ default:
+ goto out;
+ }
+
+ /*
+ * Init HYP architected timer support
+ */
+ err = kvm_timer_hyp_init();
+ if (err)
+ goto out;
+
+ kvm_perf_init();
+ kvm_coproc_table_init();
+
+out:
+ on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+ return err;
+}
+
+static void teardown_hyp_mode(void)
+{
+ int cpu;
+
+ if (is_kernel_in_hyp_mode())
+ return;
+
+ free_hyp_pgds();
+ for_each_possible_cpu(cpu)
+ free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+}
+
+static int init_vhe_mode(void)
+{
+ /* set size of VMID supported by CPU */
+ kvm_vmid_bits = kvm_get_vmid_bits();
+ kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+
+ kvm_info("VHE mode initialized successfully\n");
+ return 0;
+}
+
/**
* Inits Hyp-mode on all online CPUs
*/
@@ -1050,7 +1199,7 @@ static int init_hyp_mode(void)
stack_page = __get_free_page(GFP_KERNEL);
if (!stack_page) {
err = -ENOMEM;
- goto out_free_stack_pages;
+ goto out_err;
}
per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
@@ -1059,10 +1208,18 @@ static int init_hyp_mode(void)
/*
* Map the Hyp-code called directly from the host
*/
- err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
+ err = create_hyp_mappings(kvm_ksym_ref(__kvm_hyp_code_start),
+ kvm_ksym_ref(__kvm_hyp_code_end));
if (err) {
kvm_err("Cannot map world-switch code\n");
- goto out_free_mappings;
+ goto out_err;
+ }
+
+ err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
+ kvm_ksym_ref(__end_rodata));
+ if (err) {
+ kvm_err("Cannot map rodata section\n");
+ goto out_err;
}
/*
@@ -1074,20 +1231,10 @@ static int init_hyp_mode(void)
if (err) {
kvm_err("Cannot map hyp stack\n");
- goto out_free_mappings;
+ goto out_err;
}
}
- /*
- * Map the host CPU structures
- */
- kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
- if (!kvm_host_cpu_state) {
- err = -ENOMEM;
- kvm_err("Cannot allocate host CPU state\n");
- goto out_free_mappings;
- }
-
for_each_possible_cpu(cpu) {
kvm_cpu_context_t *cpu_ctxt;
@@ -1096,46 +1243,24 @@ static int init_hyp_mode(void)
if (err) {
kvm_err("Cannot map host CPU state: %d\n", err);
- goto out_free_context;
+ goto out_err;
}
}
- /*
- * Execute the init code on each CPU.
- */
- on_each_cpu(cpu_init_hyp_mode, NULL, 1);
-
- /*
- * Init HYP view of VGIC
- */
- err = kvm_vgic_hyp_init();
- if (err)
- goto out_free_context;
-
- /*
- * Init HYP architected timer support
- */
- err = kvm_timer_hyp_init();
- if (err)
- goto out_free_context;
-
#ifndef CONFIG_HOTPLUG_CPU
free_boot_hyp_pgd();
#endif
- kvm_perf_init();
+ /* set size of VMID supported by CPU */
+ kvm_vmid_bits = kvm_get_vmid_bits();
+ kvm_info("%d-bit VMID\n", kvm_vmid_bits);
kvm_info("Hyp mode initialized successfully\n");
return 0;
-out_free_context:
- free_percpu(kvm_host_cpu_state);
-out_free_mappings:
- free_hyp_pgds();
-out_free_stack_pages:
- for_each_possible_cpu(cpu)
- free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+
out_err:
+ teardown_hyp_mode();
kvm_err("error initializing Hyp mode: %d\n", err);
return err;
}
@@ -1179,26 +1304,27 @@ int kvm_arch_init(void *opaque)
}
}
- cpu_notifier_register_begin();
-
- err = init_hyp_mode();
+ err = init_common_resources();
if (err)
- goto out_err;
+ return err;
- err = __register_cpu_notifier(&hyp_init_cpu_nb);
- if (err) {
- kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+ if (is_kernel_in_hyp_mode())
+ err = init_vhe_mode();
+ else
+ err = init_hyp_mode();
+ if (err)
goto out_err;
- }
-
- cpu_notifier_register_done();
- hyp_cpu_pm_init();
+ err = init_subsystems();
+ if (err)
+ goto out_hyp;
- kvm_coproc_table_init();
return 0;
+
+out_hyp:
+ teardown_hyp_mode();
out_err:
- cpu_notifier_register_done();
+ teardown_common_resources();
return err;
}
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index d6c005283678..dc99159857b4 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -275,6 +275,40 @@ static u32 exc_vector_base(struct kvm_vcpu *vcpu)
return vbar;
}
+/*
+ * Switch to an exception mode, updating both CPSR and SPSR. Follow
+ * the logic described in AArch32.EnterMode() from the ARMv8 ARM.
+ */
+static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode)
+{
+ unsigned long cpsr = *vcpu_cpsr(vcpu);
+ u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
+
+ *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode;
+
+ switch (mode) {
+ case FIQ_MODE:
+ *vcpu_cpsr(vcpu) |= PSR_F_BIT;
+ /* Fall through */
+ case ABT_MODE:
+ case IRQ_MODE:
+ *vcpu_cpsr(vcpu) |= PSR_A_BIT;
+ /* Fall through */
+ default:
+ *vcpu_cpsr(vcpu) |= PSR_I_BIT;
+ }
+
+ *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
+
+ if (sctlr & SCTLR_TE)
+ *vcpu_cpsr(vcpu) |= PSR_T_BIT;
+ if (sctlr & SCTLR_EE)
+ *vcpu_cpsr(vcpu) |= PSR_E_BIT;
+
+ /* Note: These now point to the mode banked copies */
+ *vcpu_spsr(vcpu) = cpsr;
+}
+
/**
* kvm_inject_undefined - inject an undefined exception into the guest
* @vcpu: The VCPU to receive the undefined exception
@@ -286,29 +320,13 @@ static u32 exc_vector_base(struct kvm_vcpu *vcpu)
*/
void kvm_inject_undefined(struct kvm_vcpu *vcpu)
{
- unsigned long new_lr_value;
- unsigned long new_spsr_value;
unsigned long cpsr = *vcpu_cpsr(vcpu);
- u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
bool is_thumb = (cpsr & PSR_T_BIT);
u32 vect_offset = 4;
u32 return_offset = (is_thumb) ? 2 : 4;
- new_spsr_value = cpsr;
- new_lr_value = *vcpu_pc(vcpu) - return_offset;
-
- *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | UND_MODE;
- *vcpu_cpsr(vcpu) |= PSR_I_BIT;
- *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
-
- if (sctlr & SCTLR_TE)
- *vcpu_cpsr(vcpu) |= PSR_T_BIT;
- if (sctlr & SCTLR_EE)
- *vcpu_cpsr(vcpu) |= PSR_E_BIT;
-
- /* Note: These now point to UND banked copies */
- *vcpu_spsr(vcpu) = cpsr;
- *vcpu_reg(vcpu, 14) = new_lr_value;
+ kvm_update_psr(vcpu, UND_MODE);
+ *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) - return_offset;
/* Branch to exception vector */
*vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
@@ -320,30 +338,14 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
*/
static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
{
- unsigned long new_lr_value;
- unsigned long new_spsr_value;
unsigned long cpsr = *vcpu_cpsr(vcpu);
- u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
bool is_thumb = (cpsr & PSR_T_BIT);
u32 vect_offset;
u32 return_offset = (is_thumb) ? 4 : 0;
bool is_lpae;
- new_spsr_value = cpsr;
- new_lr_value = *vcpu_pc(vcpu) + return_offset;
-
- *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | ABT_MODE;
- *vcpu_cpsr(vcpu) |= PSR_I_BIT | PSR_A_BIT;
- *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
-
- if (sctlr & SCTLR_TE)
- *vcpu_cpsr(vcpu) |= PSR_T_BIT;
- if (sctlr & SCTLR_EE)
- *vcpu_cpsr(vcpu) |= PSR_E_BIT;
-
- /* Note: These now point to ABT banked copies */
- *vcpu_spsr(vcpu) = cpsr;
- *vcpu_reg(vcpu, 14) = new_lr_value;
+ kvm_update_psr(vcpu, ABT_MODE);
+ *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
if (is_pabt)
vect_offset = 12;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 11b6595c2672..767872411d97 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -28,6 +28,7 @@
#include <asm/kvm_mmio.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
+#include <asm/virt.h>
#include "trace.h"
@@ -598,6 +599,9 @@ int create_hyp_mappings(void *from, void *to)
unsigned long start = KERN_TO_HYP((unsigned long)from);
unsigned long end = KERN_TO_HYP((unsigned long)to);
+ if (is_kernel_in_hyp_mode())
+ return 0;
+
start = start & PAGE_MASK;
end = PAGE_ALIGN(end);
@@ -630,6 +634,9 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
unsigned long start = KERN_TO_HYP((unsigned long)from);
unsigned long end = KERN_TO_HYP((unsigned long)to);
+ if (is_kernel_in_hyp_mode())
+ return 0;
+
/* Check for a valid kernel IO mapping */
if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
return -EINVAL;
@@ -656,9 +663,9 @@ static void *kvm_alloc_hwpgd(void)
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
*
- * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
- * support either full 40-bit input addresses or limited to 32-bit input
- * addresses). Clears the allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) (can support either full
+ * 40-bit input addresses or be limited to 32-bit input addresses). Clears the
+ * allocated pages.
*
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
@@ -1648,6 +1655,11 @@ phys_addr_t kvm_get_idmap_vector(void)
return hyp_idmap_vector;
}
+phys_addr_t kvm_get_idmap_start(void)
+{
+ return hyp_idmap_start;
+}
+
int kvm_mmu_init(void)
{
int err;
diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S
index b2b97e3e7bab..a62a7b64f49c 100644
--- a/arch/arm/vdso/vdso.S
+++ b/arch/arm/vdso/vdso.S
@@ -23,9 +23,8 @@
#include <linux/const.h>
#include <asm/page.h>
- __PAGE_ALIGNED_DATA
-
.globl vdso_start, vdso_end
+ .section .data..ro_after_init
.balign PAGE_SIZE
vdso_start:
.incbin "arch/arm/vdso/vdso.so"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 14cdc6dea493..3510b01acc8c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -13,6 +13,7 @@ config ARM64
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
select ARCH_WANT_FRAME_POINTERS
+ select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
@@ -48,6 +49,8 @@ config ARM64
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_BITREVERSE
+ select HAVE_ARCH_HARDENED_USERCOPY
+ select HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KGDB
@@ -70,13 +73,17 @@ config ARM64
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_GENERIC_DMA_COHERENT
select HAVE_HW_BREAKPOINT if PERF_EVENTS
+ select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_MEMBLOCK
select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RCU_TABLE_FREE
select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_KPROBES
+ select HAVE_KRETPROBES if HAVE_KPROBES
select IOMMU_DMA if IOMMU_SUPPORT
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
@@ -92,6 +99,7 @@ config ARM64
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select HAVE_CONTEXT_TRACKING
+ select HAVE_ARM_SMCCC
help
ARM 64-bit (AArch64) Linux support.
@@ -362,6 +370,7 @@ config ARM64_ERRATUM_843419
bool "Cortex-A53: 843419: A load or store might access an incorrect address"
depends on MODULES
default y
+ select ARM64_MODULE_CMODEL_LARGE
help
This option builds kernel modules using the large memory model in
order to avoid the use of the ADRP instruction, which can cause
@@ -526,6 +535,9 @@ config HOTPLUG_CPU
source kernel/Kconfig.preempt
source kernel/Kconfig.hz
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
config ARCH_HAS_HOLES_MEMORYMODEL
def_bool y if SPARSEMEM
@@ -549,9 +561,6 @@ config HW_PERF_EVENTS
config SYS_SUPPORTS_HUGETLBFS
def_bool y
-config ARCH_WANT_GENERAL_HUGETLB
- def_bool y
-
config ARCH_WANT_HUGE_PMD_SHARE
def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
@@ -724,10 +733,93 @@ config ARM64_LSE_ATOMICS
endmenu
+config ARM64_UAO
+ bool "Enable support for User Access Override (UAO)"
+ default y
+ help
+ User Access Override (UAO; part of the ARMv8.2 Extensions)
+ causes the 'unprivileged' variant of the load/store instructions to
+ be overridden to be privileged.
+
+ This option changes get_user() and friends to use the 'unprivileged'
+ variant of the load/store instructions. This ensures that user-space
+ really did have access to the supplied memory. When addr_limit is
+ set to kernel memory the UAO bit will be set, allowing privileged
+ access to kernel memory.
+
+ Choosing this option will cause copy_to_user() et al to use user-space
+ memory permissions.
+
+ The feature is detected at runtime, the kernel will use the
+ regular load/store instructions if the cpu does not implement the
+ feature.
+
+config ARM64_MODULE_CMODEL_LARGE
+ bool
+
+config ARM64_MODULE_PLTS
+ bool
+ select ARM64_MODULE_CMODEL_LARGE
+ select HAVE_MOD_ARCH_SPECIFIC
+
+config RELOCATABLE
+ bool
+ help
+ This builds the kernel as a Position Independent Executable (PIE),
+ which retains all relocation metadata required to relocate the
+ kernel binary at runtime to a different virtual address than the
+ address it was linked at.
+ Since AArch64 uses the RELA relocation format, this requires a
+ relocation pass at runtime even if the kernel is loaded at the
+ same address it was linked at.
+
+config RANDOMIZE_BASE
+ bool "Randomize the address of the kernel image"
+ select ARM64_MODULE_PLTS
+ select RELOCATABLE
+ help
+ Randomizes the virtual address at which the kernel image is
+ loaded, as a security feature that deters exploit attempts
+ relying on knowledge of the location of kernel internals.
+
+ It is the bootloader's job to provide entropy, by passing a
+ random u64 value in /chosen/kaslr-seed at kernel entry.
+
+ When booting via the UEFI stub, it will invoke the firmware's
+ EFI_RNG_PROTOCOL implementation (if available) to supply entropy
+ to the kernel proper. In addition, it will randomise the physical
+ location of the kernel Image as well.
+
+ If unsure, say N.
+
+config RANDOMIZE_MODULE_REGION_FULL
+ bool "Randomize the module region independently from the core kernel"
+ depends on RANDOMIZE_BASE
+ default y
+ help
+ Randomizes the location of the module region without considering the
+ location of the core kernel. This way, it is impossible for modules
+ to leak information about the location of core kernel data structures
+ but it does imply that function calls between modules and the core
+ kernel will need to be resolved via veneers in the module PLT.
+
+ When this option is not set, the module region will be randomized over
+ a limited range that contains the [_stext, _etext] interval of the
+ core kernel, so branch relocations are always in range.
+
endmenu
menu "Boot options"
+config ARM64_ACPI_PARKING_PROTOCOL
+ bool "Enable support for the ARM64 ACPI parking protocol"
+ depends on ACPI
+ help
+ Enable support for the ARM64 ACPI parking protocol. If disabled
+ the kernel will not allow booting through the ARM64 ACPI parking
+ protocol even if the corresponding data is present in the ACPI
+ MADT table.
+
config CMDLINE
string "Default kernel command string"
default ""
@@ -810,6 +902,14 @@ menu "Power management options"
source "kernel/power/Kconfig"
+config ARCH_HIBERNATION_POSSIBLE
+ def_bool y
+ depends on CPU_PM
+
+config ARCH_HIBERNATION_HEADER
+ def_bool y
+ depends on HIBERNATION
+
config ARCH_SUSPEND_POSSIBLE
def_bool y
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 04fb73b973f1..ab1cb1fc4e3d 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -73,7 +73,7 @@ config DEBUG_RODATA
If in doubt, say Y
config DEBUG_ALIGN_RODATA
- depends on DEBUG_RODATA && ARM64_4K_PAGES
+ depends on DEBUG_RODATA
bool "Align linker sections up to SECTION_SIZE"
help
If this option is enabled, sections that may potentially be marked as
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index b6c90e5006e4..0a9bf4500852 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -15,6 +15,10 @@ CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
GZFLAGS :=-9
+ifneq ($(CONFIG_RELOCATABLE),)
+LDFLAGS_vmlinux += -pie -Bsymbolic
+endif
+
KBUILD_DEFCONFIG := defconfig
# Check for binutils support for specific extensions
@@ -28,6 +32,7 @@ endif
KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr)
KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads)
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_AFLAGS += $(lseinstr)
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
@@ -42,10 +47,14 @@ endif
CHECKFLAGS += -D__aarch64__
-ifeq ($(CONFIG_ARM64_ERRATUM_843419), y)
+ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y)
KBUILD_CFLAGS_MODULE += -mcmodel=large
endif
+ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
+KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/arm64/kernel/module.lds
+endif
+
# Default value
head-y := arch/arm64/kernel/head.o
diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts
index 53442b5ee4ff..3e1a84b01b50 100644
--- a/arch/arm64/boot/dts/arm/juno.dts
+++ b/arch/arm64/boot/dts/arm/juno.dts
@@ -143,5 +143,310 @@
<&A53_3>;
};
+ etr@20070000 {
+ compatible = "arm,coresight-tmc", "arm,primecell";
+ reg = <0 0x20070000 0 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ etr_in_port: endpoint {
+ slave-mode;
+ remote-endpoint = <&replicator_out_port1>;
+ };
+ };
+ };
+
+ tpiu@20030000 {
+ compatible = "arm,coresight-tpiu", "arm,primecell";
+ reg = <0 0x20030000 0 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ tpiu_in_port: endpoint {
+ slave-mode;
+ remote-endpoint = <&replicator_out_port0>;
+ };
+ };
+ };
+
+ replicator@20020000 {
+ /* non-configurable replicators don't show up on the
+ * AMBA bus. As such no need to add "arm,primecell".
+ */
+ compatible = "arm,coresight-replicator";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* replicator output ports */
+ port@0 {
+ reg = <0>;
+ replicator_out_port0: endpoint {
+ remote-endpoint = <&tpiu_in_port>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ replicator_out_port1: endpoint {
+ remote-endpoint = <&etr_in_port>;
+ };
+ };
+
+ /* replicator input port */
+ port@2 {
+ reg = <0>;
+ replicator_in_port0: endpoint {
+ slave-mode;
+ remote-endpoint = <&etf_out_port>;
+ };
+ };
+ };
+ };
+
+ etf@20010000 {
+ compatible = "arm,coresight-tmc", "arm,primecell";
+ reg = <0 0x20010000 0 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* input port */
+ port@0 {
+ reg = <0>;
+ etf_in_port: endpoint {
+ slave-mode;
+ remote-endpoint =
+ <&main_funnel_out_port>;
+ };
+ };
+
+ /* output port */
+ port@1 {
+ reg = <0>;
+ etf_out_port: endpoint {
+ remote-endpoint =
+ <&replicator_in_port0>;
+ };
+ };
+ };
+ };
+
+ main_funnel@20040000 {
+ compatible = "arm,coresight-funnel", "arm,primecell";
+ reg = <0 0x20040000 0 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ main_funnel_out_port: endpoint {
+ remote-endpoint =
+ <&etf_in_port>;
+ };
+ };
+
+ port@1 {
+ reg = <0>;
+ main_funnel_in_port0: endpoint {
+ slave-mode;
+ remote-endpoint =
+ <&A72_57_funnel_out_port>;
+ };
+ };
+
+ port@2 {
+ reg = <1>;
+ main_funnel_in_port1: endpoint {
+ slave-mode;
+ remote-endpoint = <&A53_funnel_out_port>;
+ };
+ };
+
+ };
+ };
+
+ A72_57_funnel@220c0000 {
+ compatible = "arm,coresight-funnel", "arm,primecell";
+ reg = <0 0x220c0000 0 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ A72_57_funnel_out_port: endpoint {
+ remote-endpoint =
+ <&main_funnel_in_port0>;
+ };
+ };
+
+ port@1 {
+ reg = <0>;
+ A72_57_funnel_in_port0: endpoint {
+ slave-mode;
+ remote-endpoint =
+ <&A72_57_etm0_out_port>;
+ };
+ };
+
+ port@2 {
+ reg = <1>;
+ A72_57_funnel_in_port1: endpoint {
+ slave-mode;
+ remote-endpoint =
+ <&A72_57_etm1_out_port>;
+ };
+ };
+ };
+ };
+
+ A53_funnel@230c0000 {
+ compatible = "arm,coresight-funnel", "arm,primecell";
+ reg = <0 0x230c0000 0 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ A53_funnel_out_port: endpoint {
+ remote-endpoint =
+ <&main_funnel_in_port1>;
+ };
+ };
+
+ port@1 {
+ reg = <0>;
+ A53_funnel_in_port0: endpoint {
+ slave-mode;
+ remote-endpoint = <&A53_etm0_out_port>;
+ };
+ };
+
+ port@2 {
+ reg = <1>;
+ A53_funnel_in_port1: endpoint {
+ slave-mode;
+ remote-endpoint = <&A53_etm1_out_port>;
+ };
+ };
+ port@3 {
+ reg = <2>;
+ A53_funnel_in_port2: endpoint {
+ slave-mode;
+ remote-endpoint = <&A53_etm2_out_port>;
+ };
+ };
+ port@4 {
+ reg = <3>;
+ A53_funnel_in_port3: endpoint {
+ slave-mode;
+ remote-endpoint = <&A53_etm3_out_port>;
+ };
+ };
+ };
+ };
+
+ etm@22040000 {
+ compatible = "arm,coresight-etm4x", "arm,primecell";
+ reg = <0 0x22040000 0 0x1000>;
+
+ cpu = <&A57_0>;
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ A72_57_etm0_out_port: endpoint {
+ remote-endpoint = <&A72_57_funnel_in_port0>;
+ };
+ };
+ };
+
+ etm@22140000 {
+ compatible = "arm,coresight-etm4x", "arm,primecell";
+ reg = <0 0x22140000 0 0x1000>;
+
+ cpu = <&A57_1>;
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ A72_57_etm1_out_port: endpoint {
+ remote-endpoint = <&A72_57_funnel_in_port1>;
+ };
+ };
+ };
+
+ etm@23040000 {
+ compatible = "arm,coresight-etm4x", "arm,primecell";
+ reg = <0 0x23040000 0 0x1000>;
+
+ cpu = <&A53_0>;
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ A53_etm0_out_port: endpoint {
+ remote-endpoint = <&A53_funnel_in_port0>;
+ };
+ };
+ };
+
+ etm@23140000 {
+ compatible = "arm,coresight-etm4x", "arm,primecell";
+ reg = <0 0x23140000 0 0x1000>;
+
+ cpu = <&A53_1>;
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ A53_etm1_out_port: endpoint {
+ remote-endpoint = <&A53_funnel_in_port1>;
+ };
+ };
+ };
+
+ etm@23240000 {
+ compatible = "arm,coresight-etm4x", "arm,primecell";
+ reg = <0 0x23240000 0 0x1000>;
+
+ cpu = <&A53_2>;
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ A53_etm2_out_port: endpoint {
+ remote-endpoint = <&A53_funnel_in_port2>;
+ };
+ };
+ };
+
+ etm@23340000 {
+ compatible = "arm,coresight-etm4x", "arm,primecell";
+ reg = <0 0x23340000 0 0x1000>;
+
+ cpu = <&A53_3>;
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ port {
+ A53_etm3_out_port: endpoint {
+ remote-endpoint = <&A53_funnel_in_port3>;
+ };
+ };
+ };
+
#include "juno-base.dtsi"
};
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index bdd7aa358d2a..79717faf2161 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -225,3 +225,6 @@ CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y
CONFIG_CRYPTO_CRC32_ARM64=y
+CONFIG_HIBERNATION=y
+CONFIG_KPROBES=y
+CONFIG_CORESIGHT=y
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index caafd63b8092..aee323b13802 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -87,9 +87,26 @@ void __init acpi_init_cpus(void);
static inline void acpi_init_cpus(void) { }
#endif /* CONFIG_ACPI */
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+bool acpi_parking_protocol_valid(int cpu);
+void __init
+acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor);
+#else
+static inline bool acpi_parking_protocol_valid(int cpu) { return false; }
+static inline void
+acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor)
+{}
+#endif
+
static inline const char *acpi_get_enable_method(int cpu)
{
- return acpi_psci_present() ? "psci" : NULL;
+ if (acpi_psci_present())
+ return "psci";
+
+ if (acpi_parking_protocol_valid(cpu))
+ return "parking-protocol";
+
+ return NULL;
}
#ifdef CONFIG_ACPI_APEI
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index d56ec0715157..beccbdefa106 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -1,6 +1,8 @@
#ifndef __ASM_ALTERNATIVE_H
#define __ASM_ALTERNATIVE_H
+#include <asm/cpufeature.h>
+
#ifndef __ASSEMBLY__
#include <linux/init.h>
@@ -19,7 +21,6 @@ struct alt_instr {
void __init apply_alternatives_all(void);
void apply_alternatives(void *start, size_t length);
-void free_alternatives_memory(void);
#define ALTINSTR_ENTRY(feature) \
" .word 661b - .\n" /* label */ \
@@ -64,6 +65,8 @@ void free_alternatives_memory(void);
#else
+#include <asm/assembler.h>
+
.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len
.word \orig_offset - .
.word \alt_offset - .
@@ -137,6 +140,65 @@ void free_alternatives_memory(void);
alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)
+/*
+ * Generate the assembly for UAO alternatives with exception table entries.
+ * This is complicated as there is no post-increment or pair versions of the
+ * unprivileged instructions, and USER() only works for single instructions.
+ */
+#ifdef CONFIG_ARM64_UAO
+ .macro uao_ldp l, reg1, reg2, addr, post_inc
+ alternative_if_not ARM64_HAS_UAO
+8888: ldp \reg1, \reg2, [\addr], \post_inc;
+8889: nop;
+ nop;
+ alternative_else
+ ldtr \reg1, [\addr];
+ ldtr \reg2, [\addr, #8];
+ add \addr, \addr, \post_inc;
+ alternative_endif
+
+ _asm_extable 8888b,\l;
+ _asm_extable 8889b,\l;
+ .endm
+
+ .macro uao_stp l, reg1, reg2, addr, post_inc
+ alternative_if_not ARM64_HAS_UAO
+8888: stp \reg1, \reg2, [\addr], \post_inc;
+8889: nop;
+ nop;
+ alternative_else
+ sttr \reg1, [\addr];
+ sttr \reg2, [\addr, #8];
+ add \addr, \addr, \post_inc;
+ alternative_endif
+
+ _asm_extable 8888b,\l;
+ _asm_extable 8889b,\l;
+ .endm
+
+ .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
+ alternative_if_not ARM64_HAS_UAO
+8888: \inst \reg, [\addr], \post_inc;
+ nop;
+ alternative_else
+ \alt_inst \reg, [\addr];
+ add \addr, \addr, \post_inc;
+ alternative_endif
+
+ _asm_extable 8888b,\l;
+ .endm
+#else
+ .macro uao_ldp l, reg1, reg2, addr, post_inc
+ USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
+ .endm
+ .macro uao_stp l, reg1, reg2, addr, post_inc
+ USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
+ .endm
+ .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
+ USER(\l, \inst \reg, [\addr], \post_inc)
+ .endm
+#endif
+
#endif /* __ASSEMBLY__ */
/*
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 12eff928ef8b..290e13428f4a 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -1,5 +1,5 @@
/*
- * Based on arch/arm/include/asm/assembler.h
+ * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S
*
* Copyright (C) 1996-2000 Russell King
* Copyright (C) 2012 ARM Ltd.
@@ -23,6 +23,9 @@
#ifndef __ASM_ASSEMBLER_H
#define __ASM_ASSEMBLER_H
+#include <asm/asm-offsets.h>
+#include <asm/page.h>
+#include <asm/pgtable-hwdef.h>
#include <asm/ptrace.h>
#include <asm/thread_info.h>
@@ -94,12 +97,19 @@
dmb \opt
.endm
+/*
+ * Emit an entry into the exception table
+ */
+ .macro _asm_extable, from, to
+ .pushsection __ex_table, "a"
+ .align 3
+ .long (\from - .), (\to - .)
+ .popsection
+ .endm
+
#define USER(l, x...) \
9999: x; \
- .section __ex_table,"a"; \
- .align 3; \
- .quad 9999b,l; \
- .previous
+ _asm_extable 9999b, l
/*
* Register aliases.
@@ -193,6 +203,113 @@ lr .req x30 // link register
str \src, [\tmp, :lo12:\sym]
.endm
+ /*
+ * @sym: The name of the per-cpu variable
+ * @reg: Result of per_cpu(sym, smp_processor_id())
+ * @tmp: scratch register
+ */
+ .macro this_cpu_ptr, sym, reg, tmp
+ adr_l \reg, \sym
+ mrs \tmp, tpidr_el1
+ add \reg, \reg, \tmp
+ .endm
+
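+/*
+ * Usage sketch (registers are illustrative, not from this hunk):
+ *	this_cpu_ptr irq_stack, x25, x26
+ * leaves the address of this CPU's irq_stack in x25, with x26 as scratch.
+ */
+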
+/*
+ * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
+ */
+ .macro vma_vm_mm, rd, rn
+ ldr \rd, [\rn, #VMA_VM_MM]
+ .endm
+
+/*
+ * mmid - get context id from mm pointer (mm->context.id)
+ */
+ .macro mmid, rd, rn
+ ldr \rd, [\rn, #MM_CONTEXT_ID]
+ .endm
+
+/*
+ * dcache_line_size - get the minimum D-cache line size from the CTR register.
+ */
+ .macro dcache_line_size, reg, tmp
+ mrs \tmp, ctr_el0 // read CTR
+ ubfm \tmp, \tmp, #16, #19 // cache line size encoding
+ mov \reg, #4 // bytes per word
+ lsl \reg, \reg, \tmp // actual cache line size
+ .endm
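+/*
+ * e.g. a CTR_EL0.DminLine field of 4 yields a line size of 4 << 4 = 64 bytes.
+ */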
+
+/*
+ * icache_line_size - get the minimum I-cache line size from the CTR register.
+ */
+ .macro icache_line_size, reg, tmp
+ mrs \tmp, ctr_el0 // read CTR
+ and \tmp, \tmp, #0xf // cache line size encoding
+ mov \reg, #4 // bytes per word
+ lsl \reg, \reg, \tmp // actual cache line size
+ .endm
+
+/*
+ * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map
+ */
+ .macro tcr_set_idmap_t0sz, valreg, tmpreg
+#ifndef CONFIG_ARM64_VA_BITS_48
+ ldr_l \tmpreg, idmap_t0sz
+ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH
+#endif
+ .endm
+
+/*
+ * Macro to perform a data cache maintenance for the interval
+ * [kaddr, kaddr + size)
+ *
+ * op: operation passed to dc instruction
+ * domain: domain used in dsb instruction
+ * kaddr: starting virtual address of the region
+ * size: size of the region
+ * Corrupts: kaddr, size, tmp1, tmp2
+ */
+ .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+ dcache_line_size \tmp1, \tmp2
+ add \size, \kaddr, \size
+ sub \tmp2, \tmp1, #1
+ bic \kaddr, \kaddr, \tmp2
+9998: dc \op, \kaddr
+ add \kaddr, \kaddr, \tmp1
+ cmp \kaddr, \size
+ b.lo 9998b
+ dsb \domain
+ .endm
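+
+/*
+ * Usage sketch (assumed operands, not from this hunk): to clean and
+ * invalidate [x0, x0 + x1) to the point of coherency:
+ *	dcache_by_line_op civac, sy, x0, x1, x2, x3
+ */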
+
+/*
+ * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present
+ */
+ .macro reset_pmuserenr_el0, tmpreg
+ mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer
+ sbfx \tmpreg, \tmpreg, #8, #4
+ cmp \tmpreg, #1 // Skip if no PMU present
+ b.lt 9000f
+ msr pmuserenr_el0, xzr // Disable PMU access from EL0
+9000:
+ .endm
+
+/*
+ * copy_page - copy src to dest using temp registers t1-t8
+ */
+ .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req
+9998: ldp \t1, \t2, [\src]
+ ldp \t3, \t4, [\src, #16]
+ ldp \t5, \t6, [\src, #32]
+ ldp \t7, \t8, [\src, #48]
+ add \src, \src, #64
+ stnp \t1, \t2, [\dest]
+ stnp \t3, \t4, [\dest, #16]
+ stnp \t5, \t6, [\dest, #32]
+ stnp \t7, \t8, [\dest, #48]
+ add \dest, \dest, #64
+ tst \src, #(PAGE_SIZE - 1)
+ b.ne 9998b
+ .endm
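+
+/*
+ * Usage sketch (hypothetical register choice): with x0 = dest and x1 = src,
+ *	copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
+ * copies one whole page using x2-x9 as scratch.
+ */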
+
/*
* Annotate a function as position independent, i.e., safe to be called before
* the kernel virtual mapping is activated.
@@ -204,4 +321,35 @@ lr .req x30 // link register
.size __pi_##x, . - x; \
ENDPROC(x)
+ /*
+ * Emit a 64-bit absolute little endian symbol reference in a way that
+ * ensures that it will be resolved at build time, even when building a
+ * PIE binary. This requires cooperation from the linker script, which
+ * must emit the lo32/hi32 halves individually.
+ */
+ .macro le64sym, sym
+ .long \sym\()_lo32
+ .long \sym\()_hi32
+ .endm
+
+ /*
+ * mov_q - move an immediate constant into a 64-bit register using
+ * between 2 and 4 movz/movk instructions (depending on the
+ * magnitude and sign of the operand)
+ */
+ .macro mov_q, reg, val
+ .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff)
+ movz \reg, :abs_g1_s:\val
+ .else
+ .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff)
+ movz \reg, :abs_g2_s:\val
+ .else
+ movz \reg, :abs_g3:\val
+ movk \reg, :abs_g2_nc:\val
+ .endif
+ movk \reg, :abs_g1_nc:\val
+ .endif
+ movk \reg, :abs_g0_nc:\val
+ .endm
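+
+/*
+ * For example, `mov_q x0, 0x1000' assembles to just movz+movk, while a
+ * full 64-bit constant such as 0xffff000012345678 takes the four
+ * instruction movz/movk sequence.
+ */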
+
#endif /* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 197e06afbf71..39c1d340fec5 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -36,7 +36,7 @@ static inline void atomic_andnot(int i, atomic_t *v)
" stclr %w[i], %[v]\n")
: [i] "+r" (w0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic_or(int i, atomic_t *v)
@@ -48,7 +48,7 @@ static inline void atomic_or(int i, atomic_t *v)
" stset %w[i], %[v]\n")
: [i] "+r" (w0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic_xor(int i, atomic_t *v)
@@ -60,7 +60,7 @@ static inline void atomic_xor(int i, atomic_t *v)
" steor %w[i], %[v]\n")
: [i] "+r" (w0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic_add(int i, atomic_t *v)
@@ -72,7 +72,7 @@ static inline void atomic_add(int i, atomic_t *v)
" stadd %w[i], %[v]\n")
: [i] "+r" (w0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
#define ATOMIC_OP_ADD_RETURN(name, mb, cl...) \
@@ -90,7 +90,7 @@ static inline int atomic_add_return##name(int i, atomic_t *v) \
" add %w[i], %w[i], w30") \
: [i] "+r" (w0), [v] "+Q" (v->counter) \
: "r" (x1) \
- : "x30" , ##cl); \
+ : __LL_SC_CLOBBERS, ##cl); \
\
return w0; \
}
@@ -116,7 +116,7 @@ static inline void atomic_and(int i, atomic_t *v)
" stclr %w[i], %[v]")
: [i] "+r" (w0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic_sub(int i, atomic_t *v)
@@ -133,7 +133,7 @@ static inline void atomic_sub(int i, atomic_t *v)
" stadd %w[i], %[v]")
: [i] "+r" (w0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
#define ATOMIC_OP_SUB_RETURN(name, mb, cl...) \
@@ -153,7 +153,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v) \
" add %w[i], %w[i], w30") \
: [i] "+r" (w0), [v] "+Q" (v->counter) \
: "r" (x1) \
- : "x30" , ##cl); \
+ : __LL_SC_CLOBBERS, ##cl); \
\
return w0; \
}
@@ -177,7 +177,7 @@ static inline void atomic64_andnot(long i, atomic64_t *v)
" stclr %[i], %[v]\n")
: [i] "+r" (x0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic64_or(long i, atomic64_t *v)
@@ -189,7 +189,7 @@ static inline void atomic64_or(long i, atomic64_t *v)
" stset %[i], %[v]\n")
: [i] "+r" (x0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic64_xor(long i, atomic64_t *v)
@@ -201,7 +201,7 @@ static inline void atomic64_xor(long i, atomic64_t *v)
" steor %[i], %[v]\n")
: [i] "+r" (x0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic64_add(long i, atomic64_t *v)
@@ -213,7 +213,7 @@ static inline void atomic64_add(long i, atomic64_t *v)
" stadd %[i], %[v]\n")
: [i] "+r" (x0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
#define ATOMIC64_OP_ADD_RETURN(name, mb, cl...) \
@@ -231,7 +231,7 @@ static inline long atomic64_add_return##name(long i, atomic64_t *v) \
" add %[i], %[i], x30") \
: [i] "+r" (x0), [v] "+Q" (v->counter) \
: "r" (x1) \
- : "x30" , ##cl); \
+ : __LL_SC_CLOBBERS, ##cl); \
\
return x0; \
}
@@ -257,7 +257,7 @@ static inline void atomic64_and(long i, atomic64_t *v)
" stclr %[i], %[v]")
: [i] "+r" (x0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
static inline void atomic64_sub(long i, atomic64_t *v)
@@ -274,7 +274,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
" stadd %[i], %[v]")
: [i] "+r" (x0), [v] "+Q" (v->counter)
: "r" (x1)
- : "x30");
+ : __LL_SC_CLOBBERS);
}
#define ATOMIC64_OP_SUB_RETURN(name, mb, cl...) \
@@ -294,7 +294,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v) \
" add %[i], %[i], x30") \
: [i] "+r" (x0), [v] "+Q" (v->counter) \
: "r" (x1) \
- : "x30" , ##cl); \
+ : __LL_SC_CLOBBERS, ##cl); \
\
return x0; \
}
@@ -330,7 +330,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
"2:")
: [ret] "+&r" (x0), [v] "+Q" (v->counter)
:
- : "x30", "cc", "memory");
+ : __LL_SC_CLOBBERS, "cc", "memory");
return x0;
}
@@ -359,7 +359,7 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr, \
" mov %" #w "[ret], " #w "30") \
: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr) \
: [old] "r" (x1), [new] "r" (x2) \
- : "x30" , ##cl); \
+ : __LL_SC_CLOBBERS, ##cl); \
\
return x0; \
}
@@ -416,7 +416,7 @@ static inline long __cmpxchg_double##name(unsigned long old1, \
[v] "+Q" (*(unsigned long *)ptr) \
: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \
[oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \
- : "x30" , ##cl); \
+ : __LL_SC_CLOBBERS, ##cl); \
\
return x0; \
}
diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h
index 81151b67b26b..ebf2481889c3 100644
--- a/arch/arm64/include/asm/boot.h
+++ b/arch/arm64/include/asm/boot.h
@@ -11,4 +11,10 @@
#define MIN_FDT_ALIGN 8
#define MAX_FDT_SIZE SZ_2M
+/*
+ * arm64 requires the kernel image to be placed
+ * TEXT_OFFSET bytes beyond a 2 MB aligned base
+ */
+#define MIN_KIMG_ALIGN SZ_2M
+
#endif
diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h
new file mode 100644
index 000000000000..ed693c5bcec0
--- /dev/null
+++ b/arch/arm64/include/asm/brk-imm.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_BRK_IMM_H
+#define __ASM_BRK_IMM_H
+
+/*
+ * #imm16 values used for BRK instruction generation
+ * Allowed values for kgdb are 0x400 - 0x7ff
+ * 0x100: for triggering a fault on purpose (reserved)
+ * 0x400: for dynamic BRK instruction
+ * 0x401: for compile time BRK instruction
+ * 0x800: kernel-mode BUG() and WARN() traps
+ */
+#define FAULT_BRK_IMM 0x100
+#define KGDB_DYN_DBG_BRK_IMM 0x400
+#define KGDB_COMPILED_DBG_BRK_IMM 0x401
+#define BUG_BRK_IMM 0x800
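+
+/*
+ * For reference: BRK #imm16 places the immediate at bits[20:5], so e.g.
+ * BUG_BRK_IMM yields the opcode 0xd4200000 | (0x800 << 5) == 0xd4210000.
+ */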
+
+#endif
diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h
index 4a748ce9ba1a..561190d15881 100644
--- a/arch/arm64/include/asm/bug.h
+++ b/arch/arm64/include/asm/bug.h
@@ -18,7 +18,7 @@
#ifndef _ARCH_ARM64_ASM_BUG_H
#define _ARCH_ARM64_ASM_BUG_H
-#include <asm/debug-monitors.h>
+#include <asm/brk-imm.h>
#ifdef CONFIG_GENERIC_BUG
#define HAVE_ARCH_BUG
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 54efedaf331f..22dda613f9c9 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -68,6 +68,7 @@
extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
extern void flush_icache_range(unsigned long start, unsigned long end);
extern void __flush_dcache_area(void *addr, size_t len);
+extern void __clean_dcache_area_pou(void *addr, size_t len);
extern long __flush_cache_user_range(unsigned long start, unsigned long end);
static inline void flush_cache_mm(struct mm_struct *mm)
@@ -155,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
#endif
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 9ea611ea69df..510c7b404454 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -19,7 +19,6 @@
#define __ASM_CMPXCHG_H
#include <linux/bug.h>
-#include <linux/mmdebug.h>
#include <asm/atomic.h>
#include <asm/barrier.h>
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index b5e9cee4b5f8..13a6103130cd 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -36,6 +36,7 @@ struct cpuinfo_arm64 {
u64 reg_id_aa64isar1;
u64 reg_id_aa64mmfr0;
u64 reg_id_aa64mmfr1;
+ u64 reg_id_aa64mmfr2;
u64 reg_id_aa64pfr0;
u64 reg_id_aa64pfr1;
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8884b5d5f48c..1695f77d8bf2 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -30,9 +30,13 @@
#define ARM64_HAS_LSE_ATOMICS 5
#define ARM64_WORKAROUND_CAVIUM_23154 6
#define ARM64_WORKAROUND_834220 7
-#define ARM64_WORKAROUND_CAVIUM_27456 8
+#define ARM64_HAS_NO_HW_PREFETCH 8
+#define ARM64_HAS_UAO 9
+#define ARM64_ALT_PAN_NOT_UAO 10
-#define ARM64_NCAPS 9
+#define ARM64_WORKAROUND_CAVIUM_27456 11
+#define ARM64_HAS_VIRT_HOST_EXTN 12
+#define ARM64_NCAPS 13
#ifndef __ASSEMBLY__
@@ -177,7 +181,7 @@ u64 read_system_reg(u32 id);
static inline bool cpu_supports_mixed_endian_el0(void)
{
- return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1));
+ return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1));
}
static inline bool system_supports_mixed_endian_el0(void)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 1a5949364ed0..b3a83da152a7 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -32,12 +32,6 @@
#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)
-#define read_cpuid(reg) ({ \
- u64 __val; \
- asm("mrs %0, " #reg : "=r" (__val)); \
- __val; \
-})
-
#define MIDR_REVISION_MASK 0xf
#define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK)
#define MIDR_PARTNUM_SHIFT 4
@@ -57,11 +51,22 @@
#define MIDR_IMPLEMENTOR(midr) \
(((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)
-#define MIDR_CPU_PART(imp, partnum) \
+#define MIDR_CPU_MODEL(imp, partnum) \
(((imp) << MIDR_IMPLEMENTOR_SHIFT) | \
(0xf << MIDR_ARCHITECTURE_SHIFT) | \
((partnum) << MIDR_PARTNUM_SHIFT))
+#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \
+ MIDR_ARCHITECTURE_MASK)
+
+#define MIDR_IS_CPU_MODEL_RANGE(midr, model, rv_min, rv_max) \
+({ \
+ u32 _model = (midr) & MIDR_CPU_MODEL_MASK; \
+ u32 rv = (midr) & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK); \
+ \
+ _model == (model) && rv >= (rv_min) && rv <= (rv_max); \
+ })
+
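+/*
+ * Usage sketch (not from this hunk):
+ *	MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), MIDR_THUNDERX, 0x00, 0x01)
+ * is true only for ThunderX r0p0/r0p1 parts.
+ */
+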
#define ARM_CPU_IMP_ARM 0x41
#define ARM_CPU_IMP_APM 0x50
#define ARM_CPU_IMP_CAVIUM 0x43
@@ -75,8 +80,20 @@
#define CAVIUM_CPU_PART_THUNDERX 0x0A1
+#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
+#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
+#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
+
#ifndef __ASSEMBLY__
+#include <asm/sysreg.h>
+
+#define read_cpuid(reg) ({ \
+ u64 __val; \
+ asm("mrs_s %0, " __stringify(reg) : "=r" (__val)); \
+ __val; \
+})
+
/*
* The CPU ID never changes at run time, so we might as well tell the
* compiler that it's constant. Use this function to read the CPU ID
@@ -84,12 +101,12 @@
*/
static inline u32 __attribute_const__ read_cpuid_id(void)
{
- return read_cpuid(MIDR_EL1);
+ return read_cpuid(SYS_MIDR_EL1);
}
static inline u64 __attribute_const__ read_cpuid_mpidr(void)
{
- return read_cpuid(MPIDR_EL1);
+ return read_cpuid(SYS_MPIDR_EL1);
}
static inline unsigned int __attribute_const__ read_cpuid_implementor(void)
@@ -104,7 +121,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void)
static inline u32 __attribute_const__ read_cpuid_cachetype(void)
{
- return read_cpuid(CTR_EL0);
+ return read_cpuid(SYS_CTR_EL0);
}
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h
index 279c85b5ec09..4b6b3f72a215 100644
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -20,6 +20,7 @@
#include <linux/errno.h>
#include <linux/types.h>
+#include <asm/brk-imm.h>
#include <asm/esr.h>
#include <asm/insn.h>
#include <asm/ptrace.h>
@@ -47,19 +48,6 @@
#define BREAK_INSTR_SIZE AARCH64_INSN_SIZE
/*
- * #imm16 values used for BRK instruction generation
- * Allowed values for kgbd are 0x400 - 0x7ff
- * 0x100: for triggering a fault on purpose (reserved)
- * 0x400: for dynamic BRK instruction
- * 0x401: for compile time BRK instruction
- * 0x800: kernel-mode BUG() and WARN() traps
- */
-#define FAULT_BRK_IMM 0x100
-#define KGDB_DYN_DBG_BRK_IMM 0x400
-#define KGDB_COMPILED_DBG_BRK_IMM 0x401
-#define BUG_BRK_IMM 0x800
-
-/*
* BRK instruction encoding
* The #imm16 value should be placed at bits[20:5] within BRK ins
*/
@@ -78,6 +66,11 @@
#define CACHE_FLUSH_IS_SAFE 1
+/* kprobes BRK opcodes with ESR encoding */
+#define BRK64_ESR_MASK 0xFFFF
+#define BRK64_ESR_KPROBES 0x0004
+#define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5))
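+/*
+ * i.e. 0xd4200000 | (0x0004 << 5) == 0xd4200080, assuming AARCH64_BREAK_MON
+ * is the base BRK opcode 0xd4200000.
+ */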
+
/* AArch32 */
#define DBG_ESR_EVT_BKPT 0x4
#define DBG_ESR_EVT_VECC 0x5
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 44dd892a4bbe..7875c886ad24 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -24,15 +24,6 @@
#include <asm/ptrace.h>
#include <asm/user.h>
-typedef unsigned long elf_greg_t;
-
-#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
-#define ELF_CORE_COPY_REGS(dest, regs) \
- *(struct user_pt_regs *)&(dest) = (regs)->user_regs;
-
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-typedef struct user_fpsimd_state elf_fpregset_t;
-
/*
* AArch64 static relocation types.
*/
@@ -86,6 +77,8 @@ typedef struct user_fpsimd_state elf_fpregset_t;
#define R_AARCH64_MOVW_PREL_G2_NC 292
#define R_AARCH64_MOVW_PREL_G3 293
+#define R_AARCH64_RELATIVE 1027
+
/*
* These are used to set parameters in the core dumps.
*/
@@ -127,6 +120,17 @@ typedef struct user_fpsimd_state elf_fpregset_t;
*/
#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)
+#ifndef __ASSEMBLY__
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
+#define ELF_CORE_COPY_REGS(dest, regs) \
+ *(struct user_pt_regs *)&(dest) = (regs)->user_regs;
+
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+typedef struct user_fpsimd_state elf_fpregset_t;
+
/*
* When the program starts, a1 contains a pointer to a function to be
* registered with atexit, as per the SVR4 ABI. A value of 0 means we have no
@@ -187,4 +191,6 @@ extern int aarch32_setup_vectors_page(struct linux_binprm *bprm,
#endif /* CONFIG_COMPAT */
+#endif /* !__ASSEMBLY__ */
+
#endif
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 309704544d22..1a617d46fce9 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -62,6 +62,16 @@ enum fixed_addresses {
FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
+
+ /*
+ * Used for kernel page table creation, so unmapped memory may be used
+ * for tables.
+ */
+ FIX_PTE,
+ FIX_PMD,
+ FIX_PUD,
+ FIX_PGD,
+
__end_of_fixed_addresses
};
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index c5534facf941..3c60f37e48ab 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -28,6 +28,8 @@ struct dyn_arch_ftrace {
extern unsigned long ftrace_graph_call;
+extern void return_to_handler(void);
+
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
/*
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 5f3ab8c1db55..f2585cdd32c2 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -42,10 +42,8 @@
"4: mov %w0, %w5\n" \
" b 3b\n" \
" .popsection\n" \
-" .pushsection __ex_table,\"a\"\n" \
-" .align 3\n" \
-" .quad 1b, 4b, 2b, 4b\n" \
-" .popsection\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
CONFIG_ARM64_PAN) \
: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \
@@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
"4: mov %w0, %w6\n"
" b 3b\n"
" .popsection\n"
-" .pushsection __ex_table,\"a\"\n"
-" .align 3\n"
-" .quad 1b, 4b, 2b, 4b\n"
-" .popsection\n"
+ _ASM_EXTABLE(1b, 4b)
+ _ASM_EXTABLE(2b, 4b)
ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
: "r" (oldval), "r" (newval), "Ir" (-EFAULT)
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index a57601f9d17c..8740297dac77 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -20,7 +20,7 @@
#include <linux/threads.h>
#include <asm/irq.h>
-#define NR_IPI 5
+#define NR_IPI 6
typedef struct {
unsigned int __softirq_pending;
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index bb4052e85dba..bbc1e35aa601 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- ptep_clear_flush(vma, addr, ptep);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- ptep_set_wrprotect(mm, addr, ptep);
-}
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- return ptep_get_and_clear(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page)
clear_bit(PG_dcache_clean, &page->flags);
}
+extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+ struct page *page, int writable);
+#define arch_make_huge_pte arch_make_huge_pte
+extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty);
+extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 30e50eb54a67..1dbaa901d7e5 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -120,6 +120,29 @@ enum aarch64_insn_register {
AARCH64_INSN_REG_SP = 31 /* Stack pointer: as load/store base reg */
};
+enum aarch64_insn_special_register {
+ AARCH64_INSN_SPCLREG_SPSR_EL1 = 0xC200,
+ AARCH64_INSN_SPCLREG_ELR_EL1 = 0xC201,
+ AARCH64_INSN_SPCLREG_SP_EL0 = 0xC208,
+ AARCH64_INSN_SPCLREG_SPSEL = 0xC210,
+ AARCH64_INSN_SPCLREG_CURRENTEL = 0xC212,
+ AARCH64_INSN_SPCLREG_DAIF = 0xDA11,
+ AARCH64_INSN_SPCLREG_NZCV = 0xDA10,
+ AARCH64_INSN_SPCLREG_FPCR = 0xDA20,
+ AARCH64_INSN_SPCLREG_DSPSR_EL0 = 0xDA28,
+ AARCH64_INSN_SPCLREG_DLR_EL0 = 0xDA29,
+ AARCH64_INSN_SPCLREG_SPSR_EL2 = 0xE200,
+ AARCH64_INSN_SPCLREG_ELR_EL2 = 0xE201,
+ AARCH64_INSN_SPCLREG_SP_EL1 = 0xE208,
+ AARCH64_INSN_SPCLREG_SPSR_INQ = 0xE218,
+ AARCH64_INSN_SPCLREG_SPSR_ABT = 0xE219,
+ AARCH64_INSN_SPCLREG_SPSR_UND = 0xE21A,
+ AARCH64_INSN_SPCLREG_SPSR_FIQ = 0xE21B,
+ AARCH64_INSN_SPCLREG_SPSR_EL3 = 0xF200,
+ AARCH64_INSN_SPCLREG_ELR_EL3 = 0xF201,
+ AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210
+};
+
enum aarch64_insn_variant {
AARCH64_INSN_VARIANT_32BIT,
AARCH64_INSN_VARIANT_64BIT
@@ -223,8 +246,15 @@ static __always_inline bool aarch64_insn_is_##abbr(u32 code) \
static __always_inline u32 aarch64_insn_get_##abbr##_value(void) \
{ return (val); }
+__AARCH64_INSN_FUNCS(adr_adrp, 0x1F000000, 0x10000000)
+__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000)
__AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800)
__AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800)
+__AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000)
+__AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000)
+__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000)
+__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000)
+__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000)
__AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000)
__AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000)
__AARCH64_INSN_FUNCS(stp_pre, 0x7FC00000, 0x29800000)
@@ -273,10 +303,15 @@ __AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001)
__AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002)
__AARCH64_INSN_FUNCS(smc, 0xFFE0001F, 0xD4000003)
__AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000)
+__AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000)
__AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F)
__AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000)
__AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000)
__AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000)
+__AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0)
+__AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000)
+__AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F)
+__AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000)
#undef __AARCH64_INSN_FUNCS
@@ -286,6 +321,8 @@ bool aarch64_insn_is_branch_imm(u32 insn);
int aarch64_insn_read(void *addr, u32 *insnp);
int aarch64_insn_write(void *addr, u32 insn);
enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn);
+bool aarch64_insn_uses_literal(u32 insn);
+bool aarch64_insn_is_branch(u32 insn);
u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn);
u32 aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
u32 insn, u64 imm);
@@ -367,9 +404,13 @@ bool aarch32_insn_is_wide(u32 insn);
#define A32_RT_OFFSET 12
#define A32_RT2_OFFSET 0
+u32 aarch64_insn_extract_system_reg(u32 insn);
u32 aarch32_insn_extract_reg_num(u32 insn, int offset);
u32 aarch32_insn_mcr_extract_opc2(u32 insn);
u32 aarch32_insn_mcr_extract_crm(u32 insn);
+
+typedef bool (pstate_check_t)(unsigned long);
+extern pstate_check_t * const aarch32_opcode_cond_checks[16];
#endif /* __ASSEMBLY__ */
#endif /* __ASM_INSN_H */
diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
index 8e8d30684392..b77197d941fc 100644
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -1,10 +1,45 @@
#ifndef __ASM_IRQ_H
#define __ASM_IRQ_H
+#define IRQ_STACK_SIZE THREAD_SIZE
+#define IRQ_STACK_START_SP THREAD_START_SP
+
+#ifndef __ASSEMBLER__
+
+#include <linux/percpu.h>
+
#include <asm-generic/irq.h>
+#include <asm/thread_info.h>
struct pt_regs;
+DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack);
+
+/*
+ * The highest address on the stack, and the first to be used. Used to
+ * find the dummy-stack frame put down by el?_irq() in entry.S, which
+ * is structured as follows:
+ *
+ * ------------
+ * | | <- irq_stack_ptr
+ * top ------------
+ * | x19 | <- irq_stack_ptr - 0x08
+ * ------------
+ * | x29 | <- irq_stack_ptr - 0x10
+ * ------------
+ *
+ * where x19 holds a copy of the task stack pointer at which the struct
+ * pt_regs from kernel_entry can be found.
+ */
+#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP)
+
+/*
+ * The offset from irq_stack_ptr where entry.S will store the original
+ * stack pointer. Used by unwind_frame() and dump_backtrace().
+ */
+#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08)))
+
extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
static inline int nr_legacy_irqs(void)
@@ -12,4 +47,14 @@ static inline int nr_legacy_irqs(void)
return 0;
}
+static inline bool on_irq_stack(unsigned long sp, int cpu)
+{
+ /* variable names match those in kernel/stacktrace.c */
+ unsigned long low = (unsigned long)per_cpu(irq_stack, cpu);
+ unsigned long high = low + IRQ_STACK_START_SP;
+
+ return (low <= sp && sp <= high);
+}
+
+#endif /* !__ASSEMBLER__ */
#endif
diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h
index 2774fa384c47..71ad0f93eb71 100644
--- a/arch/arm64/include/asm/kasan.h
+++ b/arch/arm64/include/asm/kasan.h
@@ -7,13 +7,14 @@
#include <linux/linkage.h>
#include <asm/memory.h>
+#include <asm/pgtable-types.h>
/*
* KASAN_SHADOW_START: beginning of the kernel virtual addresses.
* KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses.
*/
#define KASAN_SHADOW_START (VA_START)
-#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3)))
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
/*
* This value is used to map an address to the corresponding shadow
@@ -28,10 +29,12 @@
#define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3)))
void kasan_init(void);
+void kasan_copy_shadow(pgd_t *pgdir);
asmlinkage void kasan_early_init(void);
#else
static inline void kasan_init(void) { }
+static inline void kasan_copy_shadow(pgd_t *pgdir) { }
#endif
#endif
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index a459714ee29e..5c6375d8528b 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -79,5 +79,17 @@
#define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
#endif
+/*
+ * To make optimal use of block mappings when laying out the linear
+ * mapping, round down the base of physical memory to a size that can
+ * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
+ * (64k granule), or a multiple that can be mapped using contiguous bits
+ * in the page tables: 32 * PMD_SIZE (16k granule)
+ */
+#ifdef CONFIG_ARM64_64K_PAGES
+#define ARM64_MEMSTART_ALIGN SZ_512M
+#else
+#define ARM64_MEMSTART_ALIGN SZ_1G
+#endif
#endif /* __ASM_KERNEL_PGTABLE_H */
diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h
new file mode 100644
index 000000000000..1737aecfcc5e
--- /dev/null
+++ b/arch/arm64/include/asm/kprobes.h
@@ -0,0 +1,60 @@
+/*
+ * arch/arm64/include/asm/kprobes.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _ARM_KPROBES_H
+#define _ARM_KPROBES_H
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+#define MAX_INSN_SIZE 1
+
+#define flush_insn_slot(p) do { } while (0)
+#define kretprobe_blacklist_size 0
+
+#include <asm/probes.h>
+
+struct prev_kprobe {
+ struct kprobe *kp;
+ unsigned int status;
+};
+
+/* Single step context for kprobe */
+struct kprobe_step_ctx {
+ unsigned long ss_pending;
+ unsigned long match_addr;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+ unsigned int kprobe_status;
+ unsigned long saved_irqflag;
+ struct prev_kprobe prev_kprobe;
+ struct kprobe_step_ctx ss_ctx;
+ struct pt_regs jprobe_saved_regs;
+};
+
+void arch_remove_kprobe(struct kprobe *);
+int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
+int kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data);
+int kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr);
+int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr);
+void kretprobe_trampoline(void);
+void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
+
+#endif /* _ARM_KPROBES_H */
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 2d960f8588b0..8b709f53f874 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -83,17 +83,6 @@
#define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO)
-/* Hyp System Control Register (SCTLR_EL2) bits */
-#define SCTLR_EL2_EE (1 << 25)
-#define SCTLR_EL2_WXN (1 << 19)
-#define SCTLR_EL2_I (1 << 12)
-#define SCTLR_EL2_SA (1 << 3)
-#define SCTLR_EL2_C (1 << 2)
-#define SCTLR_EL2_A (1 << 1)
-#define SCTLR_EL2_M 1
-#define SCTLR_EL2_FLAGS (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C | \
- SCTLR_EL2_SA | SCTLR_EL2_I)
-
/* TCR_EL2 Registers bits */
#define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
#define TCR_EL2_TBI (1 << 20)
@@ -123,6 +112,7 @@
#define VTCR_EL2_SL0_LVL1 (1 << 6)
#define VTCR_EL2_T0SZ_MASK 0x3f
#define VTCR_EL2_T0SZ_40B 24
+#define VTCR_EL2_VS 19
/*
* We configure the Stage-2 page tables to always restrict the IPA space to be
@@ -167,7 +157,7 @@
#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
#define VTTBR_VMID_SHIFT (UL(48))
-#define VTTBR_VMID_MASK (UL(0xFF) << VTTBR_VMID_SHIFT)
+#define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
/* Hyp System Trap Register */
#define HSTR_EL2_T(x) (1 << x)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 5e377101f919..36a30c80032d 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -20,94 +20,38 @@
#include <asm/virt.h>
-/*
- * 0 is reserved as an invalid value.
- * Order *must* be kept in sync with the hyp switch code.
- */
-#define MPIDR_EL1 1 /* MultiProcessor Affinity Register */
-#define CSSELR_EL1 2 /* Cache Size Selection Register */
-#define SCTLR_EL1 3 /* System Control Register */
-#define ACTLR_EL1 4 /* Auxiliary Control Register */
-#define CPACR_EL1 5 /* Coprocessor Access Control */
-#define TTBR0_EL1 6 /* Translation Table Base Register 0 */
-#define TTBR1_EL1 7 /* Translation Table Base Register 1 */
-#define TCR_EL1 8 /* Translation Control Register */
-#define ESR_EL1 9 /* Exception Syndrome Register */
-#define AFSR0_EL1 10 /* Auxilary Fault Status Register 0 */
-#define AFSR1_EL1 11 /* Auxilary Fault Status Register 1 */
-#define FAR_EL1 12 /* Fault Address Register */
-#define MAIR_EL1 13 /* Memory Attribute Indirection Register */
-#define VBAR_EL1 14 /* Vector Base Address Register */
-#define CONTEXTIDR_EL1 15 /* Context ID Register */
-#define TPIDR_EL0 16 /* Thread ID, User R/W */
-#define TPIDRRO_EL0 17 /* Thread ID, User R/O */
-#define TPIDR_EL1 18 /* Thread ID, Privileged */
-#define AMAIR_EL1 19 /* Aux Memory Attribute Indirection Register */
-#define CNTKCTL_EL1 20 /* Timer Control Register (EL1) */
-#define PAR_EL1 21 /* Physical Address Register */
-#define MDSCR_EL1 22 /* Monitor Debug System Control Register */
-#define MDCCINT_EL1 23 /* Monitor Debug Comms Channel Interrupt Enable Reg */
-
-/* 32bit specific registers. Keep them at the end of the range */
-#define DACR32_EL2 24 /* Domain Access Control Register */
-#define IFSR32_EL2 25 /* Instruction Fault Status Register */
-#define FPEXC32_EL2 26 /* Floating-Point Exception Control Register */
-#define DBGVCR32_EL2 27 /* Debug Vector Catch Register */
-#define NR_SYS_REGS 28
-
-/* 32bit mapping */
-#define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
-#define c0_CSSELR (CSSELR_EL1 * 2)/* Cache Size Selection Register */
-#define c1_SCTLR (SCTLR_EL1 * 2) /* System Control Register */
-#define c1_ACTLR (ACTLR_EL1 * 2) /* Auxiliary Control Register */
-#define c1_CPACR (CPACR_EL1 * 2) /* Coprocessor Access Control */
-#define c2_TTBR0 (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */
-#define c2_TTBR0_high (c2_TTBR0 + 1) /* TTBR0 top 32 bits */
-#define c2_TTBR1 (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */
-#define c2_TTBR1_high (c2_TTBR1 + 1) /* TTBR1 top 32 bits */
-#define c2_TTBCR (TCR_EL1 * 2) /* Translation Table Base Control R. */
-#define c3_DACR (DACR32_EL2 * 2)/* Domain Access Control Register */
-#define c5_DFSR (ESR_EL1 * 2) /* Data Fault Status Register */
-#define c5_IFSR (IFSR32_EL2 * 2)/* Instruction Fault Status Register */
-#define c5_ADFSR (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */
-#define c5_AIFSR (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */
-#define c6_DFAR (FAR_EL1 * 2) /* Data Fault Address Register */
-#define c6_IFAR (c6_DFAR + 1) /* Instruction Fault Address Register */
-#define c7_PAR (PAR_EL1 * 2) /* Physical Address Register */
-#define c7_PAR_high (c7_PAR + 1) /* PAR top 32 bits */
-#define c10_PRRR (MAIR_EL1 * 2) /* Primary Region Remap Register */
-#define c10_NMRR (c10_PRRR + 1) /* Normal Memory Remap Register */
-#define c12_VBAR (VBAR_EL1 * 2) /* Vector Base Address Register */
-#define c13_CID (CONTEXTIDR_EL1 * 2) /* Context ID Register */
-#define c13_TID_URW (TPIDR_EL0 * 2) /* Thread ID, User R/W */
-#define c13_TID_URO (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
-#define c13_TID_PRIV (TPIDR_EL1 * 2) /* Thread ID, Privileged */
-#define c10_AMAIR0 (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
-#define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
-#define c14_CNTKCTL (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
-
-#define cp14_DBGDSCRext (MDSCR_EL1 * 2)
-#define cp14_DBGBCR0 (DBGBCR0_EL1 * 2)
-#define cp14_DBGBVR0 (DBGBVR0_EL1 * 2)
-#define cp14_DBGBXVR0 (cp14_DBGBVR0 + 1)
-#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
-#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
-#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
-
-#define NR_COPRO_REGS (NR_SYS_REGS * 2)
-
#define ARM_EXCEPTION_IRQ 0
#define ARM_EXCEPTION_TRAP 1
+/* The hyp-stub will return this for any kvm_call_hyp() call */
+#define ARM_EXCEPTION_HYP_GONE 2
#define KVM_ARM64_DEBUG_DIRTY_SHIFT 0
#define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
+#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset)
+
#ifndef __ASSEMBLY__
+#if __GNUC__ > 4
+#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR)
+#else
+/*
+ * GCC versions 4.9 and older will fold the constant below into the addend of
+ * the reference to 'sym' above if kvm_ksym_shift is declared static or if the
+ * constant is used directly. However, since we use the small code model for
+ * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair,
+ * with a +/- 4 GB range, resulting in linker relocation errors if the shift
+ * is sufficiently large. So prevent the compiler from folding the shift into
+ * the addend, by making the shift a variable with external linkage.
+ */
+__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR;
+#endif
+
struct kvm;
struct kvm_vcpu;
extern char __kvm_hyp_init[];
extern char __kvm_hyp_init_end[];
+extern char __kvm_hyp_reset[];
extern char __kvm_hyp_vector[];
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 25a40213bd9b..3066328cd86b 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -26,7 +26,6 @@
#include <asm/esr.h>
#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
#include <asm/kvm_mmio.h>
#include <asm/ptrace.h>
#include <asm/cputype.h>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a35ce7266aac..3be7a7b52d80 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -25,7 +25,6 @@
#include <linux/types.h>
#include <linux/kvm_types.h>
#include <asm/kvm.h>
-#include <asm/kvm_asm.h>
#include <asm/kvm_mmio.h>
#define __KVM_HAVE_ARCH_INTC_INITIALIZED
@@ -45,6 +44,7 @@
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
int kvm_arch_dev_ioctl_check_extension(long ext);
+phys_addr_t kvm_hyp_reset_entry(void);
struct kvm_arch {
/* The VMID generation used for the virt. memory system */
@@ -85,6 +85,86 @@ struct kvm_vcpu_fault_info {
u64 hpfar_el2; /* Hyp IPA Fault Address Register */
};
+/*
+ * 0 is reserved as an invalid value.
+ * Order should be kept in sync with the save/restore code.
+ */
+enum vcpu_sysreg {
+ __INVALID_SYSREG__,
+ MPIDR_EL1, /* MultiProcessor Affinity Register */
+ CSSELR_EL1, /* Cache Size Selection Register */
+ SCTLR_EL1, /* System Control Register */
+ ACTLR_EL1, /* Auxiliary Control Register */
+ CPACR_EL1, /* Coprocessor Access Control */
+ TTBR0_EL1, /* Translation Table Base Register 0 */
+ TTBR1_EL1, /* Translation Table Base Register 1 */
+ TCR_EL1, /* Translation Control Register */
+ ESR_EL1, /* Exception Syndrome Register */
+ AFSR0_EL1, /* Auxiliary Fault Status Register 0 */
+ AFSR1_EL1, /* Auxiliary Fault Status Register 1 */
+ FAR_EL1, /* Fault Address Register */
+ MAIR_EL1, /* Memory Attribute Indirection Register */
+ VBAR_EL1, /* Vector Base Address Register */
+ CONTEXTIDR_EL1, /* Context ID Register */
+ TPIDR_EL0, /* Thread ID, User R/W */
+ TPIDRRO_EL0, /* Thread ID, User R/O */
+ TPIDR_EL1, /* Thread ID, Privileged */
+ AMAIR_EL1, /* Aux Memory Attribute Indirection Register */
+ CNTKCTL_EL1, /* Timer Control Register (EL1) */
+ PAR_EL1, /* Physical Address Register */
+ MDSCR_EL1, /* Monitor Debug System Control Register */
+ MDCCINT_EL1, /* Monitor Debug Comms Channel Interrupt Enable Reg */
+
+ /* 32bit specific registers. Keep them at the end of the range */
+ DACR32_EL2, /* Domain Access Control Register */
+ IFSR32_EL2, /* Instruction Fault Status Register */
+ FPEXC32_EL2, /* Floating-Point Exception Control Register */
+ DBGVCR32_EL2, /* Debug Vector Catch Register */
+
+ NR_SYS_REGS /* Nothing after this line! */
+};
+
+/* 32bit mapping */
+#define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
+#define c0_CSSELR (CSSELR_EL1 * 2)/* Cache Size Selection Register */
+#define c1_SCTLR (SCTLR_EL1 * 2) /* System Control Register */
+#define c1_ACTLR (ACTLR_EL1 * 2) /* Auxiliary Control Register */
+#define c1_CPACR (CPACR_EL1 * 2) /* Coprocessor Access Control */
+#define c2_TTBR0 (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */
+#define c2_TTBR0_high (c2_TTBR0 + 1) /* TTBR0 top 32 bits */
+#define c2_TTBR1 (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */
+#define c2_TTBR1_high (c2_TTBR1 + 1) /* TTBR1 top 32 bits */
+#define c2_TTBCR (TCR_EL1 * 2) /* Translation Table Base Control R. */
+#define c3_DACR (DACR32_EL2 * 2)/* Domain Access Control Register */
+#define c5_DFSR (ESR_EL1 * 2) /* Data Fault Status Register */
+#define c5_IFSR (IFSR32_EL2 * 2)/* Instruction Fault Status Register */
+#define c5_ADFSR (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */
+#define c5_AIFSR (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */
+#define c6_DFAR (FAR_EL1 * 2) /* Data Fault Address Register */
+#define c6_IFAR (c6_DFAR + 1) /* Instruction Fault Address Register */
+#define c7_PAR (PAR_EL1 * 2) /* Physical Address Register */
+#define c7_PAR_high (c7_PAR + 1) /* PAR top 32 bits */
+#define c10_PRRR (MAIR_EL1 * 2) /* Primary Region Remap Register */
+#define c10_NMRR (c10_PRRR + 1) /* Normal Memory Remap Register */
+#define c12_VBAR (VBAR_EL1 * 2) /* Vector Base Address Register */
+#define c13_CID (CONTEXTIDR_EL1 * 2) /* Context ID Register */
+#define c13_TID_URW (TPIDR_EL0 * 2) /* Thread ID, User R/W */
+#define c13_TID_URO (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
+#define c13_TID_PRIV (TPIDR_EL1 * 2) /* Thread ID, Privileged */
+#define c10_AMAIR0 (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
+#define c14_CNTKCTL (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
+
+#define cp14_DBGDSCRext (MDSCR_EL1 * 2)
+#define cp14_DBGBCR0 (DBGBCR0_EL1 * 2)
+#define cp14_DBGBVR0 (DBGBVR0_EL1 * 2)
+#define cp14_DBGBXVR0 (cp14_DBGBVR0 + 1)
+#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
+#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
+#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
+
+#define NR_COPRO_REGS (NR_SYS_REGS * 2)
+
struct kvm_cpu_context {
struct kvm_regs gp_regs;
union {
@@ -222,7 +302,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
-u64 kvm_call_hyp(void *hypfn, ...);
+u64 __kvm_call_hyp(void *hypfn, ...);
void force_vm_exit(const cpumask_t *mask);
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
@@ -243,11 +323,25 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
* Call initialization code, and switch to the full blown
* HYP code.
*/
- kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
- hyp_stack_ptr, vector_ptr);
+ __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
+ hyp_stack_ptr, vector_ptr);
+}
+
+static inline void __cpu_init_stage2(void)
+{
+}
+
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+ phys_addr_t phys_idmap_start)
+{
+ /*
+ * Call reset code, and switch back to stub hyp vectors.
+ * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
+ */
+ __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
+ boot_pgd_ptr, phys_idmap_start);
}
-static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
@@ -258,4 +352,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
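+/*
+ * e.g. kvm_call_hyp(__kvm_flush_vm_context) expands to
+ * __kvm_call_hyp(kvm_ksym_ref(__kvm_flush_vm_context)), i.e. the symbol is
+ * translated with kvm_ksym_ref() before the HYP call is made.
+ */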
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h
index 889c908ee631..fe612a962576 100644
--- a/arch/arm64/include/asm/kvm_mmio.h
+++ b/arch/arm64/include/asm/kvm_mmio.h
@@ -19,7 +19,6 @@
#define __ARM64_KVM_MMIO_H__
#include <linux/kvm_host.h>
-#include <asm/kvm_asm.h>
#include <asm/kvm_arm.h>
/*
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 61505676d085..342a5ac2f3da 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -20,6 +20,7 @@
#include <asm/page.h>
#include <asm/memory.h>
+#include <asm/cpufeature.h>
/*
* As we only have the TTBR0_EL2 register, we cannot express
@@ -98,6 +99,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
@@ -158,7 +160,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
#define PTRS_PER_S2_PGD_SHIFT (KVM_PHYS_SHIFT - PGDIR_SHIFT)
#endif
#define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT)
-#define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
#define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
@@ -302,5 +303,12 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
merged_hyp_pgd[idmap_idx] = __pgd(__pa(boot_hyp_pgd) | PMD_TYPE_TABLE);
}
+static inline unsigned int kvm_get_vmid_bits(void)
+{
+ int reg = read_system_reg(SYS_ID_AA64MMFR1_EL1);
+
+ return (cpuid_feature_extract_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index 3de42d68611d..23acc00be32d 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -26,6 +26,7 @@ __asm__(".arch_extension lse");
/* Macro for constructing calls to out-of-line ll/sc atomics */
#define __LL_SC_CALL(op) "bl\t" __stringify(__LL_SC_PREFIX(op)) "\n"
+#define __LL_SC_CLOBBERS "x16", "x17", "x30"
/* In-line patching at runtime */
#define ARM64_LSE_ATOMIC_INSN(llsc, lse) \
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 853953cd1f08..d776037d199f 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -24,6 +24,7 @@
#include <linux/compiler.h>
#include <linux/const.h>
#include <linux/types.h>
+#include <asm/bug.h>
#include <asm/sizes.h>
/*
@@ -45,15 +46,15 @@
* VA_START - the first kernel virtual address.
* TASK_SIZE - the maximum size of a user space task.
* TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
- * The module space lives between the addresses given by TASK_SIZE
- * and PAGE_OFFSET - it must be within 128MB of the kernel text.
*/
#define VA_BITS (CONFIG_ARM64_VA_BITS)
#define VA_START (UL(0xffffffffffffffff) << VA_BITS)
#define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1))
-#define MODULES_END (PAGE_OFFSET)
-#define MODULES_VADDR (MODULES_END - SZ_64M)
-#define PCI_IO_END (MODULES_VADDR - SZ_2M)
+#define KIMAGE_VADDR (MODULES_END)
+#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
+#define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE)
+#define MODULES_VSIZE (SZ_128M)
+#define PCI_IO_END (PAGE_OFFSET - SZ_2M)
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP (PCI_IO_START - SZ_2M)
#define TASK_SIZE_64 (UL(1) << VA_BITS)
@@ -70,13 +71,31 @@
#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4))
+#define KERNEL_START _text
+#define KERNEL_END _end
+
+/*
+ * The size of the KASAN shadow region. This should be 1/8th of the
+ * size of the entire kernel virtual address space.
+ */
+#ifdef CONFIG_KASAN
+#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3))
+#else
+#define KASAN_SHADOW_SIZE (0)
+#endif
+
/*
* Physical vs virtual RAM address space conversion. These are
* private definitions which should NOT be used outside memory.h
* files. Use virt_to_phys/phys_to_virt/__pa/__va instead.
*/
-#define __virt_to_phys(x) (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET))
-#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET))
+#define __virt_to_phys(x) ({ \
+ phys_addr_t __x = (phys_addr_t)(x); \
+ __x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET : \
+ (__x - kimage_voffset); })
+
+#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
+#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset))
/*
* Convert a page to/from a physical address
@@ -100,19 +119,40 @@
#define MT_S2_NORMAL 0xf
#define MT_S2_DEVICE_nGnRE 0x1
+#ifdef CONFIG_ARM64_4K_PAGES
+#define IOREMAP_MAX_ORDER (PUD_SHIFT)
+#else
+#define IOREMAP_MAX_ORDER (PMD_SHIFT)
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+#define __early_init_dt_declare_initrd(__start, __end) \
+ do { \
+ initrd_start = (__start); \
+ initrd_end = (__end); \
+ } while (0)
+#endif
+
#ifndef __ASSEMBLY__
-extern phys_addr_t memstart_addr;
+#include <linux/bitops.h>
+#include <linux/mmdebug.h>
+
+extern s64 memstart_addr;
/* PHYS_OFFSET - the physical address of the start of memory. */
-#define PHYS_OFFSET ({ memstart_addr; })
+#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })
+
+/* the virtual base of the kernel image (minus TEXT_OFFSET) */
+extern u64 kimage_vaddr;
+
+/* the offset between the kernel virtual and physical mappings */
+extern u64 kimage_voffset;
/*
- * The maximum physical address that the linear direct mapping
- * of system RAM can cover. (PAGE_OFFSET can be interpreted as
- * a 2's complement signed quantity and negated to derive the
- * maximum size of the linear mapping.)
+ * Allow all memory at the discovery stage. We will clip it later.
*/
-#define MAX_MEMBLOCK_ADDR ({ memstart_addr - PAGE_OFFSET - 1; })
+#define MIN_MEMBLOCK_ADDR 0
+#define MAX_MEMBLOCK_ADDR U64_MAX
/*
* PFNs are used to describe any physical page; this means
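Note: the new __virt_to_phys() discriminates on bit (VA_BITS - 1): linear-map addresses have it set and translate through PHYS_OFFSET, kernel-image addresses have it clear and translate through kimage_voffset. A worked sketch with assumed values (VA_BITS = 48, PHYS_OFFSET = 0x80000000; these numbers are examples, not from this patch):

	/* Linear map, bit 47 set:
	 *   va = 0xffff800000100000
	 *   pa = (va & ~PAGE_OFFSET) + PHYS_OFFSET
	 *      = 0x0000000000100000 + 0x80000000 = 0x80100000
	 * Kernel image, bit 47 clear:
	 *   va = 0xffff000008080000
	 *   pa = va - kimage_voffset
	 */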
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 24165784b803..a00f7cf35bbd 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -27,6 +27,7 @@
#include <asm-generic/mm_hooks.h>
#include <asm/cputype.h>
#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
#ifdef CONFIG_PID_IN_CONTEXTIDR
static inline void contextidr_thread_switch(struct task_struct *next)
@@ -48,7 +49,7 @@ static inline void contextidr_thread_switch(struct task_struct *next)
*/
static inline void cpu_set_reserved_ttbr0(void)
{
- unsigned long ttbr = page_to_phys(empty_zero_page);
+ unsigned long ttbr = virt_to_phys(empty_zero_page);
asm(
" msr ttbr0_el1, %0 // set TTBR0\n"
@@ -73,7 +74,7 @@ static inline bool __cpu_uses_extended_idmap(void)
/*
* Set TCR.T0SZ to its default value (based on VA_BITS)
*/
-static inline void cpu_set_default_tcr_t0sz(void)
+static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
{
unsigned long tcr;
@@ -86,7 +87,62 @@ static inline void cpu_set_default_tcr_t0sz(void)
" msr tcr_el1, %0 ;"
" isb"
: "=&r" (tcr)
- : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH));
+ : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH));
+}
+
+#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS))
+#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz)
+
+/*
+ * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm.
+ *
+ * The idmap lives in the same VA range as userspace, but uses global entries
+ * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from
+ * speculative TLB fetches, we must temporarily install the reserved page
+ * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ.
+ *
+ * If current is not a user task, the mm covers the TTBR1_EL1 page tables,
+ * which should not be installed in TTBR0_EL1. In this case we can leave the
+ * reserved page tables in place.
+ */
+static inline void cpu_uninstall_idmap(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ cpu_set_reserved_ttbr0();
+ local_flush_tlb_all();
+ cpu_set_default_tcr_t0sz();
+
+ if (mm != &init_mm)
+ cpu_switch_mm(mm->pgd, mm);
+}
+
+static inline void cpu_install_idmap(void)
+{
+ cpu_set_reserved_ttbr0();
+ local_flush_tlb_all();
+ cpu_set_idmap_tcr_t0sz();
+
+ cpu_switch_mm(idmap_pg_dir, &init_mm);
+}
+
+/*
+ * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
+ * avoiding the possibility of conflicting TLB entries being allocated.
+ */
+static inline void cpu_replace_ttbr1(pgd_t *pgd)
+{
+ typedef void (ttbr_replace_func)(phys_addr_t);
+ extern ttbr_replace_func idmap_cpu_replace_ttbr1;
+ ttbr_replace_func *replace_phys;
+
+ phys_addr_t pgd_phys = virt_to_phys(pgd);
+
+ replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1);
+
+ cpu_install_idmap();
+ replace_phys(pgd_phys);
+ cpu_uninstall_idmap();
}
/*
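Note: cpu_replace_ttbr1() sidesteps TLB conflict aborts by jumping to the physical alias of idmap_cpu_replace_ttbr1 while the idmap is installed in TTBR0_EL1. A hedged usage sketch; the caller and the PGD symbol are illustrative:

	/* Sketch: switch the kernel onto a newly built, VA-compatible PGD. */
	cpu_replace_ttbr1(swapper_pg_dir);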
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index e80e232b730e..e12af6754634 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -20,4 +20,21 @@
#define MODULE_ARCH_VERMAGIC "aarch64"
+#ifdef CONFIG_ARM64_MODULE_PLTS
+struct mod_arch_specific {
+ struct elf64_shdr *plt;
+ int plt_num_entries;
+ int plt_max_entries;
+};
+#endif
+
+u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela,
+ Elf64_Sym *sym);
+
+#ifdef CONFIG_RANDOMIZE_BASE
+extern u64 module_alloc_base;
+#else
+#define module_alloc_base ((u64)_etext - MODULES_VSIZE)
+#endif
+
#endif /* __ASM_MODULE_H */
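Note: module_alloc_base anchors the module allocation window: randomized at boot under CONFIG_RANDOMIZE_BASE, otherwise a fixed 128 MB region ending at _etext. A sketch of an allocator built on it; the __vmalloc_node_range() arguments here are assumptions, not taken from this patch:

	static void *module_alloc_sketch(unsigned long size)
	{
		return __vmalloc_node_range(size, 1, module_alloc_base,
					    module_alloc_base + MODULES_VSIZE,
					    GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
					    NUMA_NO_NODE,
					    __builtin_return_address(0));
	}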
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 9b2f5a9d019d..fbafd0ad16df 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -19,6 +19,8 @@
#ifndef __ASM_PAGE_H
#define __ASM_PAGE_H
+#include <linux/const.h>
+
/* PAGE_SHIFT determines the page size */
/* CONT_SHIFT determines the number of pages which can be tracked together */
#ifdef CONFIG_ARM64_64K_PAGES
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index c15053902942..ff98585d085a 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -42,11 +42,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
free_page((unsigned long)pmd);
}
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
{
- set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
+ set_pud(pud, __pud(pmd | prot));
}
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE);
+}
+#else
+static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
+{
+ BUILD_BUG();
+}
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#if CONFIG_PGTABLE_LEVELS > 3
@@ -62,11 +71,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
free_page((unsigned long)pud);
}
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
{
- set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
+ set_pgd(pgdp, __pgd(pud | prot));
}
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE);
+}
+#else
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
+{
+ BUILD_BUG();
+}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
extern pgd_t *pgd_alloc(struct mm_struct *mm);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index b9da9545b442..9786f770088d 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -90,7 +90,23 @@
/*
* Contiguous page definitions.
*/
-#define CONT_PTES (_AC(1, UL) << CONT_SHIFT)
+#ifdef CONFIG_ARM64_64K_PAGES
+#define CONT_PTE_SHIFT 5
+#define CONT_PMD_SHIFT 5
+#elif defined(CONFIG_ARM64_16K_PAGES)
+#define CONT_PTE_SHIFT 7
+#define CONT_PMD_SHIFT 5
+#else
+#define CONT_PTE_SHIFT 4
+#define CONT_PMD_SHIFT 4
+#endif
+
+#define CONT_PTES (1 << CONT_PTE_SHIFT)
+#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE)
+#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1))
+#define CONT_PMDS (1 << CONT_PMD_SHIFT)
+#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE)
+#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1))
/* the numerical offset of the PTE within a range of CONT_PTES */
#define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1))
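Note: these shifts fix the contiguous-hint block sizes per granule. Working through the arithmetic (CONT_PTE_SIZE = CONT_PTES * PAGE_SIZE, CONT_PMD_SIZE = CONT_PMDS * PMD_SIZE):

	 4K pages: CONT_PTES = 16,  CONT_PTE_SIZE = 64K;  CONT_PMDS = 16, CONT_PMD_SIZE = 32M
	16K pages: CONT_PTES = 128, CONT_PTE_SIZE = 2M;   CONT_PMDS = 32, CONT_PMD_SIZE = 1G
	64K pages: CONT_PTES = 32,  CONT_PTE_SIZE = 2M;   CONT_PMDS = 32, CONT_PMD_SIZE = 16G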
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 67c2ad6d33b7..9a09ccf7122d 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -36,19 +36,13 @@
*
* VMEMMAP_SIZE: allows the whole linear region to be covered by a struct page array
* (rounded up to PUD_SIZE).
- * VMALLOC_START: beginning of the kernel VA space
+ * VMALLOC_START: beginning of the kernel vmalloc space
* VMALLOC_END: extends to the available space below vmemmap, PCI I/O space,
* fixed mappings and modules
*/
#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
-#ifndef CONFIG_KASAN
-#define VMALLOC_START (VA_START)
-#else
-#include <asm/kasan.h>
-#define VMALLOC_START (KASAN_SHADOW_END + SZ_64K)
-#endif
-
+#define VMALLOC_START (MODULES_END)
#define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
#define VMEMMAP_START (VMALLOC_END + SZ_64K)
@@ -59,6 +53,7 @@
#ifndef __ASSEMBLY__
+#include <asm/fixmap.h>
#include <linux/mmdebug.h>
extern void __pte_error(const char *file, int line, unsigned long val);
@@ -123,8 +118,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr) (empty_zero_page)
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page)
#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
@@ -136,16 +131,6 @@ extern struct page *empty_zero_page;
#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0))
#define pte_page(pte) (pfn_to_page(pte_pfn(pte)))
-/* Find an entry in the third-level page table. */
-#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-
-#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr))
-
-#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr))
-#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr))
-#define pte_unmap(pte) do { } while (0)
-#define pte_unmap_nested(pte) do { } while (0)
-
/*
* The following only work if pte_present(). Undefined behaviour otherwise.
*/
@@ -168,6 +153,16 @@ extern struct page *empty_zero_page;
#define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID))
#define pte_valid_not_user(pte) \
((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID)
+#define pte_valid_young(pte) \
+ ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF))
+
+/*
+ * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
+ * so that we don't erroneously return false for pages that have been
+ * remapped as PROT_NONE but are yet to be flushed from the TLB.
+ */
+#define pte_accessible(mm, pte) \
+ (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte))
static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
{
@@ -218,7 +213,8 @@ static inline pte_t pte_mkspecial(pte_t pte)
static inline pte_t pte_mkcont(pte_t pte)
{
- return set_pte_bit(pte, __pgprot(PTE_CONT));
+ pte = set_pte_bit(pte, __pgprot(PTE_CONT));
+ return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE));
}
static inline pte_t pte_mknoncont(pte_t pte)
@@ -226,6 +222,11 @@ static inline pte_t pte_mknoncont(pte_t pte)
return clear_pte_bit(pte, __pgprot(PTE_CONT));
}
+static inline pmd_t pmd_mkcont(pmd_t pmd)
+{
+ return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
+}
+
static inline void set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
@@ -299,7 +300,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
/*
* Hugetlb definitions.
*/
-#define HUGE_MAX_HSTATE 2
+#define HUGE_MAX_HSTATE 4
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
@@ -354,6 +355,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
@@ -425,13 +427,31 @@ static inline void pmd_clear(pmd_t *pmdp)
set_pmd(pmdp, __pmd(0));
}
-static inline pte_t *pmd_page_vaddr(pmd_t pmd)
+static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
{
- return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK);
+ return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
}
+/* Find an entry in the third-level page table. */
+#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+
+#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t))
+#define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr))))
+
+#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr))
+#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr))
+#define pte_unmap(pte) do { } while (0)
+#define pte_unmap_nested(pte) do { } while (0)
+
+#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr))
+#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr))
+#define pte_clear_fixmap() clear_fixmap(FIX_PTE)
+
#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
+/* use ONLY for statically allocated translation tables */
+#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))
+
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
@@ -458,21 +478,37 @@ static inline void pud_clear(pud_t *pudp)
set_pud(pudp, __pud(0));
}
-static inline pmd_t *pud_page_vaddr(pud_t pud)
+static inline phys_addr_t pud_page_paddr(pud_t pud)
{
- return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK);
+ return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
}
/* Find an entry in the second-level page table. */
#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
-{
- return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr);
-}
+#define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t))
+#define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr))))
+
+#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr))
+#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr))
+#define pmd_clear_fixmap() clear_fixmap(FIX_PMD)
#define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK))
+/* use ONLY for statically allocated translation tables */
+#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))
+
+#else
+
+#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; })
+
+/* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
+#define pmd_set_fixmap(addr) NULL
+#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp)
+#define pmd_clear_fixmap()
+
+#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir)
+
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#if CONFIG_PGTABLE_LEVELS > 3
@@ -494,21 +530,37 @@ static inline void pgd_clear(pgd_t *pgdp)
set_pgd(pgdp, __pgd(0));
}
-static inline pud_t *pgd_page_vaddr(pgd_t pgd)
+static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
{
- return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK);
+ return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
}
/* Find an entry in the first-level page table. */
#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
-static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
-{
- return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
-}
+#define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t))
+#define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr))))
+
+#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr))
+#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr))
+#define pud_clear_fixmap() clear_fixmap(FIX_PUD)
#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK))
+/* use ONLY for statically allocated translation tables */
+#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr))))
+
+#else
+
+#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0; })
+
+/* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
+#define pud_set_fixmap(addr) NULL
+#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp)
+#define pud_clear_fixmap()
+
+#define pud_offset_kimg(dir,addr) ((pud_t *)dir)
+
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
@@ -516,11 +568,16 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
/* to find an entry in a page-table-directory */
#define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
-#define pgd_offset(mm, addr) ((mm)->pgd+pgd_index(addr))
+#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr))
+
+#define pgd_offset(mm, addr) (pgd_offset_raw((mm)->pgd, (addr)))
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
+#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr))
+#define pgd_clear_fixmap() clear_fixmap(FIX_PGD)
+
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
@@ -681,7 +738,8 @@ extern int kern_addr_valid(unsigned long addr);
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
+void pgd_cache_init(void);
+#define pgtable_cache_init pgd_cache_init
/*
* On AArch64, the cache coherency is handled via the set_pte_at() function.
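Note: the new *_set_fixmap_offset()/*_clear_fixmap() helpers let early boot code edit page tables through a temporary fixmap slot, before (or without) a linear mapping of the tables. A minimal sketch of the pattern; the pmdp/pfn/prot values and surrounding locking are assumptions:

	/* Map the PTE page via FIX_PTE, write one entry, tear the window down. */
	pte_t *ptep = pte_set_fixmap_offset(pmdp, addr);
	set_pte(ptep, pfn_pte(pfn, prot));
	pte_clear_fixmap();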
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h
new file mode 100644
index 000000000000..5af574d632fa
--- /dev/null
+++ b/arch/arm64/include/asm/probes.h
@@ -0,0 +1,35 @@
+/*
+ * arch/arm64/include/asm/probes.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef _ARM_PROBES_H
+#define _ARM_PROBES_H
+
+#include <asm/opcodes.h>
+
+struct kprobe;
+struct arch_specific_insn;
+
+typedef u32 kprobe_opcode_t;
+typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *);
+
+/* architecture specific copy of original instruction */
+struct arch_specific_insn {
+ kprobe_opcode_t *insn;
+ pstate_check_t *pstate_cc;
+ kprobes_handler_t *handler;
+	/* restore address after single-stepping out of line (XOL) */
+ unsigned long restore;
+};
+
+#endif
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index d08559528927..4be934fde409 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -29,8 +29,10 @@
#include <linux/string.h>
+#include <asm/alternative.h>
#include <asm/fpsimd.h>
#include <asm/hw_breakpoint.h>
+#include <asm/lse.h>
#include <asm/pgtable-hwdef.h>
#include <asm/ptrace.h>
#include <asm/types.h>
@@ -177,9 +179,11 @@ static inline void prefetchw(const void *ptr)
}
#define ARCH_HAS_SPINLOCK_PREFETCH
-static inline void spin_lock_prefetch(const void *x)
+static inline void spin_lock_prefetch(const void *ptr)
{
- prefetchw(x);
+ asm volatile(ARM64_LSE_ATOMIC_INSN(
+ "prfm pstl1strm, %a0",
+ "nop") : : "p" (ptr));
}
#define HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -187,5 +191,6 @@ static inline void spin_lock_prefetch(const void *x)
#endif
int cpu_enable_pan(void *__unused);
+int cpu_enable_uao(void *__unused);
#endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 7f94755089e2..1528d52eb8c0 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -121,6 +121,8 @@ struct pt_regs {
u64 unused; // maintain 16 byte alignment
};
+#define MAX_REG_OFFSET offsetof(struct pt_regs, pstate)
+
#define arch_has_single_step() (1)
#ifdef CONFIG_COMPAT
@@ -146,9 +148,57 @@ struct pt_regs {
#define fast_interrupts_enabled(regs) \
(!((regs)->pstate & PSR_F_BIT))
-#define user_stack_pointer(regs) \
+#define GET_USP(regs) \
(!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp)
+#define SET_USP(ptregs, value) \
+ (!compat_user_mode(regs) ? ((regs)->sp = value) : ((regs)->compat_sp = value))
+
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned int n);
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which the register value is read
+ * @offset: offset of the register.
+ *
+ * regs_get_register() returns the value of the register at @offset from @regs.
+ * The @offset is the offset of the register in struct pt_regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline u64 regs_get_register(struct pt_regs *regs, unsigned int offset)
+{
+ u64 val = 0;
+
+ offset >>= 3;
+ switch (offset) {
+ case 0 ... 30:
+ val = regs->regs[offset];
+ break;
+ case offsetof(struct pt_regs, sp) >> 3:
+ val = regs->sp;
+ break;
+ case offsetof(struct pt_regs, pc) >> 3:
+ val = regs->pc;
+ break;
+ case offsetof(struct pt_regs, pstate) >> 3:
+ val = regs->pstate;
+ break;
+ default:
+ val = 0;
+ }
+
+ return val;
+}
+
+/* Valid only for Kernel mode traps. */
+static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ return regs->sp;
+}
+
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
return regs->regs[0];
@@ -158,8 +208,15 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
struct task_struct;
int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task);
-#define instruction_pointer(regs) ((unsigned long)(regs)->pc)
+#define GET_IP(regs) ((unsigned long)(regs)->pc)
+#define SET_IP(regs, value) ((regs)->pc = ((u64) (value)))
+
+#define GET_FP(ptregs) ((unsigned long)(ptregs)->regs[29])
+#define SET_FP(ptregs, value) ((ptregs)->regs[29] = ((u64) (value)))
+
+#include <asm-generic/ptrace.h>
+#undef profile_pc
extern unsigned long profile_pc(struct pt_regs *regs);
#endif /* __ASSEMBLY__ */
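Note: the register-query helpers pair naturally with regs_get_register(). A hedged sketch of a consumer; only the two helpers come from this patch, the handler itself is illustrative:

	static void sketch_handler(struct pt_regs *regs)
	{
		int off = regs_query_register_offset("x0");

		if (off >= 0)
			pr_info("x0 = 0x%llx\n", regs_get_register(regs, off));
	}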
diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h
index 4df608a8459e..e368a55ebd22 100644
--- a/arch/arm64/include/asm/shmparam.h
+++ b/arch/arm64/include/asm/shmparam.h
@@ -21,7 +21,7 @@
* alignment value. Since we don't have aliasing D-caches, the rest of
* the time we can safely use PAGE_SIZE.
*/
-#define COMPAT_SHMLBA 0x4000
+#define COMPAT_SHMLBA (4 * PAGE_SIZE)
#include <asm-generic/shmparam.h>
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index d9c3d6a6100a..2013a4dc5124 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -64,6 +64,15 @@ extern void secondary_entry(void);
extern void arch_send_call_function_single_ipi(int cpu);
extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
+#else
+static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
+{
+ BUILD_BUG();
+}
+#endif
+
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index 499e8de33a00..53ee219e76a7 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -26,9 +26,28 @@
* The memory barriers are implicit with the load-acquire and store-release
* instructions.
*/
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ unsigned int tmp;
+ arch_spinlock_t lockval;
-#define arch_spin_unlock_wait(lock) \
- do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+ asm volatile(
+" sevl\n"
+"1: wfe\n"
+"2: ldaxr %w0, %2\n"
+" eor %w1, %w0, %w0, ror #16\n"
+" cbnz %w1, 1b\n"
+ ARM64_LSE_ATOMIC_INSN(
+ /* LL/SC */
+" stxr %w1, %w0, %2\n"
+" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */
+ /* LSE atomics */
+" nop\n"
+" nop\n")
+ : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
+ :
+ : "memory");
+}
#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 7318f6d54aa9..801a16dbbdf6 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -16,14 +16,19 @@
#ifndef __ASM_STACKTRACE_H
#define __ASM_STACKTRACE_H
+struct task_struct;
+
struct stackframe {
unsigned long fp;
unsigned long sp;
unsigned long pc;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ unsigned int graph;
+#endif
};
-extern int unwind_frame(struct stackframe *frame);
-extern void walk_stackframe(struct stackframe *frame,
+extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
+extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
int (*fn)(struct stackframe *, void *), void *data);
#endif /* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h
index 59a5b0f1e81c..024d623f662e 100644
--- a/arch/arm64/include/asm/suspend.h
+++ b/arch/arm64/include/asm/suspend.h
@@ -1,7 +1,8 @@
#ifndef __ASM_SUSPEND_H
#define __ASM_SUSPEND_H
-#define NR_CTX_REGS 11
+#define NR_CTX_REGS 10
+#define NR_CALLEE_SAVED_REGS 12
/*
* struct cpu_suspend_ctx must be 16-byte aligned since it is allocated on
@@ -16,11 +17,34 @@ struct cpu_suspend_ctx {
u64 sp;
} __aligned(16);
-struct sleep_save_sp {
- phys_addr_t *save_ptr_stash;
- phys_addr_t save_ptr_stash_phys;
+/*
+ * Memory to save the cpu state is allocated on the stack by
+ * __cpu_suspend_enter()'s caller, and populated by __cpu_suspend_enter().
+ * This data must survive until cpu_resume() is called.
+ *
+ * This struct describes the size and the layout of the saved cpu state.
+ * The layout of the callee_saved_regs is defined by the implementation
+ * of __cpu_suspend_enter(), and cpu_resume(). This struct must be passed
+ * in by the caller as __cpu_suspend_enter()'s stack-frame is gone once it
+ * returns, and the data would be subsequently corrupted by the call to the
+ * finisher.
+ */
+struct sleep_stack_data {
+ struct cpu_suspend_ctx system_regs;
+ unsigned long callee_saved_regs[NR_CALLEE_SAVED_REGS];
};
+extern unsigned long *sleep_save_stash;
+
extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long));
extern void cpu_resume(void);
+int __cpu_suspend_enter(struct sleep_stack_data *state);
+void __cpu_suspend_exit(void);
+void _cpu_resume(void);
+
+int swsusp_arch_suspend(void);
+int swsusp_arch_resume(void);
+int arch_hibernation_header_save(void *addr, unsigned int max_size);
+int arch_hibernation_header_restore(void *addr);
+
#endif
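Note: the suspend contract described in the comment above can be sketched as follows; the shape mirrors that contract, but the control flow shown is an assumption, not lifted from this patch:

	static int cpu_suspend_sketch(unsigned long arg,
				      int (*finisher)(unsigned long))
	{
		struct sleep_stack_data state;	/* caller's stack, survives entry */

		if (__cpu_suspend_enter(&state))
			return finisher(arg);	/* should not return on success */

		/* Resumed here: cpu_resume() restored context from 'state'. */
		return 0;
	}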
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d48ab5b41f52..0961a24e8d48 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -20,6 +20,8 @@
#ifndef __ASM_SYSREG_H
#define __ASM_SYSREG_H
+#include <linux/stringify.h>
+
#include <asm/opcodes.h>
/*
@@ -70,20 +72,35 @@
#define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0)
#define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1)
+#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2)
#define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0)
#define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1)
#define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7)
#define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4)
+#define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3)
#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\
(!!x)<<8 | 0x1f)
+#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\
+ (!!x)<<8 | 0x1f)
+
+/* Common SCTLR_ELx flags. */
+#define SCTLR_ELx_EE (1 << 25)
+#define SCTLR_ELx_I (1 << 12)
+#define SCTLR_ELx_SA (1 << 3)
+#define SCTLR_ELx_C (1 << 2)
+#define SCTLR_ELx_A (1 << 1)
+#define SCTLR_ELx_M 1
-/* SCTLR_EL1 */
-#define SCTLR_EL1_CP15BEN (0x1 << 5)
-#define SCTLR_EL1_SED (0x1 << 8)
-#define SCTLR_EL1_SPAN (0x1 << 23)
+#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
+ SCTLR_ELx_SA | SCTLR_ELx_I)
+
+/* SCTLR_EL1 specific flags. */
+#define SCTLR_EL1_SPAN (1 << 23)
+#define SCTLR_EL1_SED (1 << 8)
+#define SCTLR_EL1_CP15BEN (1 << 5)
/* id_aa64isar0 */
@@ -135,6 +152,9 @@
#define ID_AA64MMFR1_VMIDBITS_SHIFT 4
#define ID_AA64MMFR1_HADBS_SHIFT 0
+/* id_aa64mmfr2 */
+#define ID_AA64MMFR2_UAO_SHIFT 4
+
/* id_aa64dfr0 */
#define ID_AA64DFR0_CTX_CMPS_SHIFT 28
#define ID_AA64DFR0_WRPS_SHIFT 20
@@ -194,32 +214,34 @@
#ifdef __ASSEMBLY__
.irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
- .equ __reg_num_x\num, \num
+ .equ .L__reg_num_x\num, \num
.endr
- .equ __reg_num_xzr, 31
+ .equ .L__reg_num_xzr, 31
.macro mrs_s, rt, sreg
- .inst 0xd5200000|(\sreg)|(__reg_num_\rt)
+ .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt)
.endm
.macro msr_s, sreg, rt
- .inst 0xd5000000|(\sreg)|(__reg_num_\rt)
+ .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt)
.endm
#else
+#include <linux/types.h>
+
asm(
" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n"
-" .equ __reg_num_x\\num, \\num\n"
+" .equ .L__reg_num_x\\num, \\num\n"
" .endr\n"
-" .equ __reg_num_xzr, 31\n"
+" .equ .L__reg_num_xzr, 31\n"
"\n"
" .macro mrs_s, rt, sreg\n"
-" .inst 0xd5200000|(\\sreg)|(__reg_num_\\rt)\n"
+" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n"
" .endm\n"
"\n"
" .macro msr_s, sreg, rt\n"
-" .inst 0xd5000000|(\\sreg)|(__reg_num_\\rt)\n"
+" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n"
" .endm\n"
);
@@ -232,6 +254,23 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
val |= set;
asm volatile("msr sctlr_el1, %0" : : "r" (val));
}
+
+/*
+ * Unlike read_cpuid, calls to read_sysreg are never expected to be
+ * optimized away or replaced with synthetic values.
+ */
+#define read_sysreg(r) ({ \
+ u64 __val; \
+ asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \
+ __val; \
+})
+
+#define write_sysreg(v, r) do { \
+ u64 __val = (u64)v; \
+ asm volatile("msr " __stringify(r) ", %0" \
+ : : "r" (__val)); \
+} while (0)
+
#endif
#endif /* __ASM_SYSREG_H */
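Note: the accessors take the register name as a bare token, so a read-modify-write looks like this (a minimal sketch using the SCTLR bits defined above):

	u64 sctlr = read_sysreg(sctlr_el1);

	write_sysreg(sctlr | SCTLR_ELx_I, sctlr_el1);	/* enable the I-cache */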
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 90c7ff233735..abd64bd1f6d9 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp");
*/
static inline struct thread_info *current_thread_info(void) __attribute_const__;
+/*
+ * struct thread_info can be accessed directly via sp_el0.
+ */
static inline struct thread_info *current_thread_info(void)
{
- return (struct thread_info *)
- (current_stack_pointer & ~(THREAD_SIZE - 1));
+ unsigned long sp_el0;
+
+ asm ("mrs %0, sp_el0" : "=r" (sp_el0));
+
+ return (struct thread_info *)sp_el0;
}
#define thread_saved_pc(tsk) \
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index b2ede967fe7d..c3d445b42351 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -36,11 +36,11 @@
#define VERIFY_WRITE 1
/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of pairs of relative offsets: the first
+ * is the relative offset to an instruction that is allowed to fault,
+ * and the second is the relative offset at which the program should
+ * continue. No registers are modified, so it is entirely up to the
+ * continuation code to figure out what to do.
*
* All the routines below use bits of fixup code that are out of line
* with the main instruction path. This means when everything is well,
@@ -50,9 +50,11 @@
struct exception_table_entry
{
- unsigned long insn, fixup;
+ int insn, fixup;
};
+#define ARCH_HAS_RELATIVE_EXTABLE
+
extern int fixup_exception(struct pt_regs *regs);
#define KERNEL_DS (-1UL)
@@ -64,6 +66,16 @@ extern int fixup_exception(struct pt_regs *regs);
static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
+
+ /*
+ * Enable/disable UAO so that copy_to_user() etc can access
+ * kernel memory with the unprivileged instructions.
+ */
+ if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS)
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO));
+ else
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO,
+ CONFIG_ARM64_UAO));
}
#define segment_eq(a, b) ((a) == (b))
@@ -105,6 +117,12 @@ static inline void set_fs(mm_segment_t fs)
#define access_ok(type, addr, size) __range_ok(addr, size)
#define user_addr_max get_fs
+#define _ASM_EXTABLE(from, to) \
+ " .pushsection __ex_table, \"a\"\n" \
+ " .align 3\n" \
+ " .long (" #from " - .), (" #to " - .)\n" \
+ " .popsection\n"
+
/*
* The "__xxx" versions of the user access functions do not verify the address
* space - it must have been done previously with a separate "access_ok()"
@@ -113,9 +131,10 @@ static inline void set_fs(mm_segment_t fs)
* The "__xxx_error" versions set the third argument to -EFAULT if an error
* occurs, and leave it unchanged on success.
*/
-#define __get_user_asm(instr, reg, x, addr, err) \
+#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \
asm volatile( \
- "1: " instr " " reg "1, [%2]\n" \
+ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \
+ alt_instr " " reg "1, [%2]\n", feature) \
"2:\n" \
" .section .fixup, \"ax\"\n" \
" .align 2\n" \
@@ -123,10 +142,7 @@ static inline void set_fs(mm_segment_t fs)
" mov %1, #0\n" \
" b 2b\n" \
" .previous\n" \
- " .section __ex_table,\"a\"\n" \
- " .align 3\n" \
- " .quad 1b, 3b\n" \
- " .previous" \
+ _ASM_EXTABLE(1b, 3b) \
: "+r" (err), "=&r" (x) \
: "r" (addr), "i" (-EFAULT))
@@ -134,26 +150,30 @@ static inline void set_fs(mm_segment_t fs)
do { \
unsigned long __gu_val; \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
CONFIG_ARM64_PAN)); \
switch (sizeof(*(ptr))) { \
case 1: \
- __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 2: \
- __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 4: \
- __get_user_asm("ldr", "%w", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 8: \
- __get_user_asm("ldr", "%", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldr", "ldtr", "%", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
default: \
BUILD_BUG(); \
} \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
CONFIG_ARM64_PAN)); \
} while (0)
@@ -181,19 +201,17 @@ do { \
((x) = 0, -EFAULT); \
})
-#define __put_user_asm(instr, reg, x, addr, err) \
+#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \
asm volatile( \
- "1: " instr " " reg "1, [%2]\n" \
+ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \
+ alt_instr " " reg "1, [%2]\n", feature) \
"2:\n" \
" .section .fixup,\"ax\"\n" \
" .align 2\n" \
"3: mov %w0, %3\n" \
" b 2b\n" \
" .previous\n" \
- " .section __ex_table,\"a\"\n" \
- " .align 3\n" \
- " .quad 1b, 3b\n" \
- " .previous" \
+ _ASM_EXTABLE(1b, 3b) \
: "+r" (err) \
: "r" (x), "r" (addr), "i" (-EFAULT))
@@ -201,25 +219,29 @@ do { \
do { \
__typeof__(*(ptr)) __pu_val = (x); \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
CONFIG_ARM64_PAN)); \
switch (sizeof(*(ptr))) { \
case 1: \
- __put_user_asm("strb", "%w", __pu_val, (ptr), (err)); \
+ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 2: \
- __put_user_asm("strh", "%w", __pu_val, (ptr), (err)); \
+ __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 4: \
- __put_user_asm("str", "%w", __pu_val, (ptr), (err)); \
+ __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 8: \
- __put_user_asm("str", "%", __pu_val, (ptr), (err)); \
+ __put_user_asm("str", "sttr", "%", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
default: \
BUILD_BUG(); \
} \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
CONFIG_ARM64_PAN)); \
} while (0)
@@ -247,24 +269,39 @@ do { \
-EFAULT; \
})
-extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
+static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ check_object_size(to, n, false);
+ return __arch_copy_from_user(to, from, n);
+}
+
+static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ check_object_size(from, n, true);
+ return __arch_copy_to_user(to, from, n);
+}
+
static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
{
- if (access_ok(VERIFY_READ, from, n))
- n = __copy_from_user(to, from, n);
- else /* security hole - plug it */
+ if (access_ok(VERIFY_READ, from, n)) {
+ check_object_size(to, n, false);
+ n = __arch_copy_from_user(to, from, n);
+ } else /* security hole - plug it */
memset(to, 0, n);
return n;
}
static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n)
{
- if (access_ok(VERIFY_WRITE, to, n))
- n = __copy_to_user(to, from, n);
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ check_object_size(from, n, true);
+ n = __arch_copy_to_user(to, from, n);
+ }
return n;
}
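Note: with relative offsets, an exception-table entry is resolved by adding each field to its own address. A sketch of the decoding, which the generic extable code performs once ARCH_HAS_RELATIVE_EXTABLE is defined; the helper names here are illustrative:

	static inline unsigned long ex_insn_addr(const struct exception_table_entry *x)
	{
		return (unsigned long)&x->insn + x->insn;
	}

	static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x)
	{
		return (unsigned long)&x->fixup + x->fixup;
	}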
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 7a5df5252dd7..06e6a5238c4c 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -18,11 +18,29 @@
#ifndef __ASM__VIRT_H
#define __ASM__VIRT_H
+/*
+ * The arm64 hcall implementation uses x0 to specify the hcall type. A value
+ * less than 0xfff indicates a special hcall, such as get/set vector.
+ * Any other value is used as a pointer to the function to call.
+ */
+
+/* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */
+#define HVC_GET_VECTORS 0
+
+/*
+ * HVC_SET_VECTORS - Set the value of the vbar_el2 register.
+ *
+ * @x1: Physical address of the new vector table.
+ */
+#define HVC_SET_VECTORS 1
+
#define BOOT_CPU_MODE_EL1 (0xe11)
#define BOOT_CPU_MODE_EL2 (0xe12)
#ifndef __ASSEMBLY__
+#include <asm/ptrace.h>
+
/*
* __boot_cpu_mode records what mode CPUs were booted in.
* A correctly-implemented bootloader must start all CPUs in the same mode:
@@ -50,6 +68,14 @@ static inline bool is_hyp_mode_mismatched(void)
return __boot_cpu_mode[0] != __boot_cpu_mode[1];
}
+static inline bool is_kernel_in_hyp_mode(void)
+{
+ u64 el;
+
+ asm("mrs %0, CurrentEL" : "=r" (el));
+ return el == CurrentEL_EL2;
+}
+
/* The section containing the hypervisor text */
extern char __hyp_text_start[];
extern char __hyp_text_end[];
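Note: the hcall convention in the comment above can be summarized as C pseudocode; the real dispatcher is the EL2 stub's exception vector, so this sketch is purely illustrative:

	/* x0 selects the operation; x1 carries HVC_SET_VECTORS's argument. */
	if (x0 == HVC_GET_VECTORS)
		return read_sysreg(vbar_el2);
	else if (x0 == HVC_SET_VECTORS)
		write_sysreg(x1, vbar_el2);
	else
		((void (*)(void))x0)();		/* anything else is a function pointer */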
diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h
index aab5bf09e9d9..2b79b8a89457 100644
--- a/arch/arm64/include/asm/word-at-a-time.h
+++ b/arch/arm64/include/asm/word-at-a-time.h
@@ -16,6 +16,8 @@
#ifndef __ASM_WORD_AT_A_TIME_H
#define __ASM_WORD_AT_A_TIME_H
+#include <asm/uaccess.h>
+
#ifndef __AARCH64EB__
#include <linux/kernel.h>
@@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr)
#endif
" b 2b\n"
" .popsection\n"
- " .pushsection __ex_table,\"a\"\n"
- " .align 3\n"
- " .quad 1b, 3b\n"
- " .popsection"
+ _ASM_EXTABLE(1b, 3b)
: "=&r" (ret), "=&r" (offset)
: "r" (addr), "Q" (*(unsigned long *)addr));
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 3378238b5d8b..d1ff83dfe5de 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -45,6 +45,7 @@
#define PSR_A_BIT 0x00000100
#define PSR_D_BIT 0x00000200
#define PSR_PAN_BIT 0x00400000
+#define PSR_UAO_BIT 0x00800000
#define PSR_Q_BIT 0x08000000
#define PSR_V_BIT 0x10000000
#define PSR_C_BIT 0x20000000
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 474691f8b13a..20bcc2db06bf 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,10 +14,10 @@ CFLAGS_REMOVE_return_address.o = -pg
arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
entry-fpsimd.o process.o ptrace.o setup.o signal.o \
sys.o stacktrace.o time.o traps.o io.o vdso.o \
- hyp-stub.o psci.o psci-call.o cpu_ops.o insn.o \
+ hyp-stub.o psci.o cpu_ops.o insn.o \
return_address.o cpuinfo.o cpu_errata.o \
cpufeature.o alternative.o cacheinfo.o \
- smp.o smp_spin_table.o topology.o
+ smp.o smp_spin_table.o topology.o smccc-call.o
extra-$(CONFIG_EFI) := efi-entry.o
@@ -26,10 +26,10 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE
$(call if_changed,objcopy)
arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \
- sys_compat.o entry32.o \
- ../../arm/kernel/opcodes.o
+ sys_compat.o entry32.o
arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o
arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o
+arm64-obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
@@ -41,8 +41,12 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o
arm64-obj-$(CONFIG_PCI) += pci.o
arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o
arm64-obj-$(CONFIG_ACPI) += acpi.o
+arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
+arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o
+arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o
-obj-y += $(arm64-obj-y) vdso/
+obj-y += $(arm64-obj-y) vdso/ probes/
obj-m += $(arm64-obj-m)
head-y := head.o
extra-y += $(head-y) vmlinux.lds
diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c
new file mode 100644
index 000000000000..4b1e5a7a98da
--- /dev/null
+++ b/arch/arm64/kernel/acpi_parking_protocol.c
@@ -0,0 +1,153 @@
+/*
+ * ARM64 ACPI Parking Protocol implementation
+ *
+ * Authors: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ * Mark Salter <msalter@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/acpi.h>
+#include <linux/types.h>
+
+#include <asm/cpu_ops.h>
+
+struct cpu_mailbox_entry {
+ phys_addr_t mailbox_addr;
+ u8 version;
+ u8 gic_cpu_id;
+};
+
+static struct cpu_mailbox_entry cpu_mailbox_entries[NR_CPUS];
+
+void __init acpi_set_mailbox_entry(int cpu,
+ struct acpi_madt_generic_interrupt *p)
+{
+ struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+
+ cpu_entry->mailbox_addr = p->parked_address;
+ cpu_entry->version = p->parking_version;
+ cpu_entry->gic_cpu_id = p->cpu_interface_number;
+}
+
+bool acpi_parking_protocol_valid(int cpu)
+{
+ struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+
+ return cpu_entry->mailbox_addr && cpu_entry->version;
+}
+
+static int acpi_parking_protocol_cpu_init(unsigned int cpu)
+{
+ pr_debug("%s: ACPI parked addr=%llx\n", __func__,
+ cpu_mailbox_entries[cpu].mailbox_addr);
+
+ return 0;
+}
+
+static int acpi_parking_protocol_cpu_prepare(unsigned int cpu)
+{
+ return 0;
+}
+
+struct parking_protocol_mailbox {
+ __le32 cpu_id;
+ __le32 reserved;
+ __le64 entry_point;
+};
+
+static int acpi_parking_protocol_cpu_boot(unsigned int cpu)
+{
+ struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+ struct parking_protocol_mailbox __iomem *mailbox;
+ __le32 cpu_id;
+
+ /*
+	 * Map mailbox memory with attribute device nGnRE (i.e. ioremap -
+ * this deviates from the parking protocol specifications since
+ * the mailboxes are required to be mapped nGnRnE; the attribute
+ * discrepancy is harmless insofar as the protocol specification
+ * is concerned).
+ * If the mailbox is mistakenly allocated in the linear mapping
+	 * by FW, ioremap will fail since the mapping will be prevented
+ * by the kernel (it clashes with the linear mapping attributes
+ * specifications).
+ */
+ mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox));
+ if (!mailbox)
+ return -EIO;
+
+ cpu_id = readl_relaxed(&mailbox->cpu_id);
+ /*
+	 * Check if firmware has set up the mailbox entry properly
+ * before kickstarting the respective cpu.
+ */
+ if (cpu_id != ~0U) {
+ iounmap(mailbox);
+ return -ENXIO;
+ }
+
+ /*
+ * We write the entry point and cpu id as LE regardless of the
+ * native endianness of the kernel. Therefore, any boot-loaders
+ * that read this address need to convert this address to the
+ * Boot-Loader's endianness before jumping.
+ */
+ writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point);
+ writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id);
+
+ arch_send_wakeup_ipi_mask(cpumask_of(cpu));
+
+ iounmap(mailbox);
+
+ return 0;
+}
+
+static void acpi_parking_protocol_cpu_postboot(void)
+{
+ int cpu = smp_processor_id();
+ struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+ struct parking_protocol_mailbox __iomem *mailbox;
+ __le64 entry_point;
+
+ /*
+	 * Map mailbox memory with attribute device nGnRE (i.e. ioremap -
+ * this deviates from the parking protocol specifications since
+ * the mailboxes are required to be mapped nGnRnE; the attribute
+ * discrepancy is harmless insofar as the protocol specification
+ * is concerned).
+ * If the mailbox is mistakenly allocated in the linear mapping
+	 * by FW, ioremap will fail since the mapping will be prevented
+ * by the kernel (it clashes with the linear mapping attributes
+ * specifications).
+ */
+ mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox));
+ if (!mailbox)
+ return;
+
+	entry_point = readq_relaxed(&mailbox->entry_point);
+ /*
+ * Check if firmware has cleared the entry_point as expected
+ * by the protocol specification.
+ */
+ WARN_ON(entry_point);
+
+ iounmap(mailbox);
+}
+
+const struct cpu_operations acpi_parking_protocol_ops = {
+ .name = "parking-protocol",
+ .cpu_init = acpi_parking_protocol_cpu_init,
+ .cpu_prepare = acpi_parking_protocol_cpu_prepare,
+ .cpu_boot = acpi_parking_protocol_cpu_boot,
+ .cpu_postboot = acpi_parking_protocol_cpu_postboot
+};
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index ab9db0e9818c..d2ee1b21a10d 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length)
__apply_alternatives(&region);
}
-
-void free_alternatives_memory(void)
-{
- free_reserved_area(__alt_instructions, __alt_instructions_end,
- 0, "alternatives");
-}
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 3b6d8cc9dfe0..ee97181e4477 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -26,6 +26,8 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/kprobes.h>
+#include <linux/arm-smccc.h>
#include <asm/checksum.h>
@@ -33,8 +35,8 @@ EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
/* user mem (segment) */
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(__arch_copy_from_user);
+EXPORT_SYMBOL(__arch_copy_to_user);
EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(__copy_in_user);
@@ -67,4 +69,9 @@ EXPORT_SYMBOL(test_and_change_bit);
#ifdef CONFIG_FUNCTION_TRACER
EXPORT_SYMBOL(_mcount);
+NOKPROBE_SYMBOL(_mcount);
#endif
+
+ /* arm-smccc */
+EXPORT_SYMBOL(arm_smccc_smc);
+EXPORT_SYMBOL(arm_smccc_hvc);
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 937f5e58a4d3..293489476529 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -62,7 +62,7 @@ struct insn_emulation {
};
static LIST_HEAD(insn_emulation);
-static int nr_insn_emulated;
+static int nr_insn_emulated __initdata;
static DEFINE_RAW_SPINLOCK(insn_emulation_lock);
static void register_emulation_hooks(struct insn_emulation_ops *ops)
@@ -173,7 +173,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn,
return ret;
}
-static void register_insn_emulation(struct insn_emulation_ops *ops)
+static void __init register_insn_emulation(struct insn_emulation_ops *ops)
{
unsigned long flags;
struct insn_emulation *insn;
@@ -237,7 +237,7 @@ static struct ctl_table ctl_abi[] = {
{ }
};
-static void register_insn_emulation_sysctl(struct ctl_table *table)
+static void __init register_insn_emulation_sysctl(struct ctl_table *table)
{
unsigned long flags;
int i = 0;
@@ -297,11 +297,8 @@ static void register_insn_emulation_sysctl(struct ctl_table *table)
"4: mov %w0, %w5\n" \
" b 3b\n" \
" .popsection" \
- " .pushsection __ex_table,\"a\"\n" \
- " .align 3\n" \
- " .quad 0b, 4b\n" \
- " .quad 1b, 4b\n" \
- " .popsection\n" \
+ _ASM_EXTABLE(0b, 4b) \
+ _ASM_EXTABLE(1b, 4b) \
ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
CONFIG_ARM64_PAN) \
: "=&r" (res), "+r" (data), "=&r" (temp) \
@@ -369,6 +366,21 @@ static int emulate_swpX(unsigned int address, unsigned int *data,
return res;
}
+#define ARM_OPCODE_CONDITION_UNCOND 0xf
+
+static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr)
+{
+ u32 cc_bits = opcode >> 28;
+
+ if (cc_bits != ARM_OPCODE_CONDITION_UNCOND) {
+ if ((*aarch32_opcode_cond_checks[cc_bits])(psr))
+ return ARM_OPCODE_CONDTEST_PASS;
+ else
+ return ARM_OPCODE_CONDTEST_FAIL;
+ }
+ return ARM_OPCODE_CONDTEST_UNCOND;
+}
+
/*
* swp_handler logs the id of calling process, dissects the instruction, sanity
* checks the memory location, calls emulate_swpX for the actual operation and
@@ -383,7 +395,7 @@ static int swp_handler(struct pt_regs *regs, u32 instr)
type = instr & TYPE_SWPB;
- switch (arm_check_condition(instr, regs->pstate)) {
+ switch (aarch32_check_condition(instr, regs->pstate)) {
case ARM_OPCODE_CONDTEST_PASS:
break;
case ARM_OPCODE_CONDTEST_FAIL:
@@ -464,7 +476,7 @@ static int cp15barrier_handler(struct pt_regs *regs, u32 instr)
{
perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->pc);
- switch (arm_check_condition(instr, regs->pstate)) {
+ switch (aarch32_check_condition(instr, regs->pstate)) {
case ARM_OPCODE_CONDTEST_PASS:
break;
case ARM_OPCODE_CONDTEST_FAIL:
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 087cf9a65359..2bb17bd556f8 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -22,12 +22,14 @@
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/kvm_host.h>
+#include <linux/suspend.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
#include <asm/vdso_datapage.h>
#include <linux/kbuild.h>
+#include <linux/arm-smccc.h>
int main(void)
{
@@ -49,6 +51,17 @@ int main(void)
DEFINE(S_X5, offsetof(struct pt_regs, regs[5]));
DEFINE(S_X6, offsetof(struct pt_regs, regs[6]));
DEFINE(S_X7, offsetof(struct pt_regs, regs[7]));
+ DEFINE(S_X8, offsetof(struct pt_regs, regs[8]));
+ DEFINE(S_X10, offsetof(struct pt_regs, regs[10]));
+ DEFINE(S_X12, offsetof(struct pt_regs, regs[12]));
+ DEFINE(S_X14, offsetof(struct pt_regs, regs[14]));
+ DEFINE(S_X16, offsetof(struct pt_regs, regs[16]));
+ DEFINE(S_X18, offsetof(struct pt_regs, regs[18]));
+ DEFINE(S_X20, offsetof(struct pt_regs, regs[20]));
+ DEFINE(S_X22, offsetof(struct pt_regs, regs[22]));
+ DEFINE(S_X24, offsetof(struct pt_regs, regs[24]));
+ DEFINE(S_X26, offsetof(struct pt_regs, regs[26]));
+ DEFINE(S_X28, offsetof(struct pt_regs, regs[28]));
DEFINE(S_LR, offsetof(struct pt_regs, regs[30]));
DEFINE(S_SP, offsetof(struct pt_regs, sp));
#ifdef CONFIG_COMPAT
@@ -109,58 +122,25 @@ int main(void)
DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs));
DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs));
DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs));
- DEFINE(CPU_SP_EL1, offsetof(struct kvm_regs, sp_el1));
- DEFINE(CPU_ELR_EL1, offsetof(struct kvm_regs, elr_el1));
- DEFINE(CPU_SPSR, offsetof(struct kvm_regs, spsr));
- DEFINE(CPU_SYSREGS, offsetof(struct kvm_cpu_context, sys_regs));
+ DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
DEFINE(VCPU_ESR_EL2, offsetof(struct kvm_vcpu, arch.fault.esr_el2));
DEFINE(VCPU_FAR_EL2, offsetof(struct kvm_vcpu, arch.fault.far_el2));
DEFINE(VCPU_HPFAR_EL2, offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
- DEFINE(VCPU_DEBUG_FLAGS, offsetof(struct kvm_vcpu, arch.debug_flags));
- DEFINE(VCPU_DEBUG_PTR, offsetof(struct kvm_vcpu, arch.debug_ptr));
- DEFINE(DEBUG_BCR, offsetof(struct kvm_guest_debug_arch, dbg_bcr));
- DEFINE(DEBUG_BVR, offsetof(struct kvm_guest_debug_arch, dbg_bvr));
- DEFINE(DEBUG_WCR, offsetof(struct kvm_guest_debug_arch, dbg_wcr));
- DEFINE(DEBUG_WVR, offsetof(struct kvm_guest_debug_arch, dbg_wvr));
- DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
- DEFINE(VCPU_MDCR_EL2, offsetof(struct kvm_vcpu, arch.mdcr_el2));
- DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines));
DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context));
- DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
- DEFINE(VCPU_TIMER_CNTV_CTL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
- DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
- DEFINE(KVM_TIMER_CNTVOFF, offsetof(struct kvm, arch.timer.cntvoff));
- DEFINE(KVM_TIMER_ENABLED, offsetof(struct kvm, arch.timer.enabled));
- DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
- DEFINE(VCPU_VGIC_CPU, offsetof(struct kvm_vcpu, arch.vgic_cpu));
- DEFINE(VGIC_V2_CPU_HCR, offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
- DEFINE(VGIC_V2_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
- DEFINE(VGIC_V2_CPU_MISR, offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
- DEFINE(VGIC_V2_CPU_EISR, offsetof(struct vgic_cpu, vgic_v2.vgic_eisr));
- DEFINE(VGIC_V2_CPU_ELRSR, offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr));
- DEFINE(VGIC_V2_CPU_APR, offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
- DEFINE(VGIC_V2_CPU_LR, offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
- DEFINE(VGIC_V3_CPU_SRE, offsetof(struct vgic_cpu, vgic_v3.vgic_sre));
- DEFINE(VGIC_V3_CPU_HCR, offsetof(struct vgic_cpu, vgic_v3.vgic_hcr));
- DEFINE(VGIC_V3_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr));
- DEFINE(VGIC_V3_CPU_MISR, offsetof(struct vgic_cpu, vgic_v3.vgic_misr));
- DEFINE(VGIC_V3_CPU_EISR, offsetof(struct vgic_cpu, vgic_v3.vgic_eisr));
- DEFINE(VGIC_V3_CPU_ELRSR, offsetof(struct vgic_cpu, vgic_v3.vgic_elrsr));
- DEFINE(VGIC_V3_CPU_AP0R, offsetof(struct vgic_cpu, vgic_v3.vgic_ap0r));
- DEFINE(VGIC_V3_CPU_AP1R, offsetof(struct vgic_cpu, vgic_v3.vgic_ap1r));
- DEFINE(VGIC_V3_CPU_LR, offsetof(struct vgic_cpu, vgic_v3.vgic_lr));
- DEFINE(VGIC_CPU_NR_LR, offsetof(struct vgic_cpu, nr_lr));
- DEFINE(KVM_VTTBR, offsetof(struct kvm, arch.vttbr));
- DEFINE(KVM_VGIC_VCTRL, offsetof(struct kvm, arch.vgic.vctrl_base));
#endif
#ifdef CONFIG_CPU_PM
DEFINE(CPU_SUSPEND_SZ, sizeof(struct cpu_suspend_ctx));
DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
DEFINE(MPIDR_HASH_MASK, offsetof(struct mpidr_hash, mask));
DEFINE(MPIDR_HASH_SHIFTS, offsetof(struct mpidr_hash, shift_aff));
- DEFINE(SLEEP_SAVE_SP_SZ, sizeof(struct sleep_save_sp));
- DEFINE(SLEEP_SAVE_SP_PHYS, offsetof(struct sleep_save_sp, save_ptr_stash_phys));
- DEFINE(SLEEP_SAVE_SP_VIRT, offsetof(struct sleep_save_sp, save_ptr_stash));
+ DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS, offsetof(struct sleep_stack_data, system_regs));
+ DEFINE(SLEEP_STACK_DATA_CALLEE_REGS, offsetof(struct sleep_stack_data, callee_saved_regs));
#endif
+ DEFINE(ARM_SMCCC_RES_X0_OFFS, offsetof(struct arm_smccc_res, a0));
+ DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2));
+ BLANK();
+ DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address));
+ DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address));
+ DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next));
return 0;
}
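
[Editor's sketch] The DEFINE() entries added above rely on the kbuild asm-offsets trick: this file is compiled to assembly, and structure offsets are scraped out of magic marker lines so hand-written .S code can use them as immediates. A standalone sketch of the mechanism (struct pt_regs_demo is a stand-in, not the kernel's pt_regs):

#include <stddef.h>

struct pt_regs_demo { unsigned long regs[31]; unsigned long sp; };

/* mirrors the shape of linux/kbuild.h: emit a "->NAME value" marker
 * into the generated .s output via an immediate asm operand */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

int main(void)
{
	DEFINE(S_X8, offsetof(struct pt_regs_demo, regs[8]));
	DEFINE(S_SP, offsetof(struct pt_regs_demo, sp));
	return 0;
}

Compiling with cc -S yields lines like "->S_X8 64 offsetof(struct pt_regs_demo, regs[8])", which a sed rule rewrites into "#define S_X8 64" style headers. The marker lines are not valid assembler input, so the file is only ever compiled with -S, never assembled.
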
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index a3e846a28b05..06afd04e02c0 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -21,24 +21,12 @@
#include <asm/cputype.h>
#include <asm/cpufeature.h>
-#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
-#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
-#define MIDR_THUNDERX MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
-
-#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \
- MIDR_ARCHITECTURE_MASK)
-
static bool __maybe_unused
is_affected_midr_range(const struct arm64_cpu_capabilities *entry)
{
- u32 midr = read_cpuid_id();
-
- if ((midr & CPU_MODEL_MASK) != entry->midr_model)
- return false;
-
- midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK;
-
- return (midr >= entry->midr_range_min && midr <= entry->midr_range_max);
+ return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model,
+ entry->midr_range_min,
+ entry->midr_range_max);
}
#define MIDR_RANGE(model, min, max) \
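
[Editor's sketch] The open-coded mask-and-compare removed above is now hidden behind MIDR_IS_CPU_MODEL_RANGE(). A C sketch of the equivalent logic (field masks follow the architected MIDR_EL1 layout; the helper name and demo value are illustrative, not the kernel macro):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIDR_REVISION_MASK	0x0000000f	/* bits 3:0 */
#define MIDR_VARIANT_MASK	0x00f00000	/* bits 23:20 */
#define MIDR_MODEL_MASK		(~(MIDR_REVISION_MASK | MIDR_VARIANT_MASK))

static bool midr_is_model_range(uint32_t midr, uint32_t model,
				uint32_t rv_min, uint32_t rv_max)
{
	uint32_t rv = midr & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK);

	/* model must match exactly; variant:revision must fall in range */
	return (midr & MIDR_MODEL_MASK) == model &&
	       rv >= rv_min && rv <= rv_max;
}

int main(void)
{
	uint32_t midr = 0x43000a10;	/* hypothetical MIDR value */

	printf("%d\n", midr_is_model_range(midr, midr & MIDR_MODEL_MASK,
					   0x0, 0x00100000 | 0xf));
	return 0;
}
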
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c
index b6bd7d447768..c7cfb8fe06f9 100644
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -25,19 +25,30 @@
#include <asm/smp_plat.h>
extern const struct cpu_operations smp_spin_table_ops;
+extern const struct cpu_operations acpi_parking_protocol_ops;
extern const struct cpu_operations cpu_psci_ops;
const struct cpu_operations *cpu_ops[NR_CPUS];
-static const struct cpu_operations *supported_cpu_ops[] __initconst = {
+static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = {
&smp_spin_table_ops,
&cpu_psci_ops,
NULL,
};
+static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = {
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+ &acpi_parking_protocol_ops,
+#endif
+ &cpu_psci_ops,
+ NULL,
+};
+
static const struct cpu_operations * __init cpu_get_ops(const char *name)
{
- const struct cpu_operations **ops = supported_cpu_ops;
+ const struct cpu_operations **ops;
+
+ ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops;
while (*ops) {
if (!strcmp(name, (*ops)->name))
@@ -75,8 +86,16 @@ static const char *__init cpu_read_enable_method(int cpu)
}
} else {
enable_method = acpi_get_enable_method(cpu);
- if (!enable_method)
- pr_err("Unsupported ACPI enable-method\n");
+ if (!enable_method) {
+ /*
 * In ACPI systems the boot CPU does not require
 * checking the enable method since for some
 * boot protocols (e.g. the parking protocol) it
 * need not be initialized. Don't warn spuriously.
+ */
+ if (cpu != 0)
+ pr_err("Unsupported ACPI enable-method\n");
+ }
}
return enable_method;
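
[Editor's sketch] The cpu_get_ops() change above boils down to picking one of two NULL-terminated tables (DT or ACPI boot methods) and matching by name. A user-space model of that lookup (the cpu_operations structure and tables are stand-ins):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct cpu_operations { const char *name; };

static const struct cpu_operations spin_table_ops = { "spin-table" };
static const struct cpu_operations psci_ops = { "psci" };
static const struct cpu_operations parking_ops = { "acpi-parking-protocol" };

static const struct cpu_operations *dt_ops[] = { &spin_table_ops, &psci_ops, NULL };
static const struct cpu_operations *acpi_ops[] = { &parking_ops, &psci_ops, NULL };

static const struct cpu_operations *cpu_get_ops(const char *name, bool acpi_disabled)
{
	/* DT systems keep the old table; ACPI systems get the parking protocol */
	const struct cpu_operations **ops = acpi_disabled ? dt_ops : acpi_ops;

	for (; *ops; ops++)
		if (!strcmp(name, (*ops)->name))
			return *ops;
	return NULL;
}

int main(void)
{
	printf("%s\n", cpu_get_ops("psci", true)->name);	/* prints "psci" */
	return 0;
}
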
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 2735bf814592..eda7d5915fbb 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -28,6 +28,7 @@
#include <asm/cpu_ops.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
+#include <asm/virt.h>
unsigned long elf_hwcap __read_mostly;
EXPORT_SYMBOL_GPL(elf_hwcap);
@@ -69,6 +70,10 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
.width = 0, \
}
+/* meta feature for alternatives */
+static bool __maybe_unused
+cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry);
+
static struct arm64_ftr_bits ftr_id_aa64isar0[] = {
ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0),
@@ -125,6 +130,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
ARM64_FTR_END,
};
+static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
static struct arm64_ftr_bits ftr_ctr[] = {
U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */
ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0),
@@ -286,6 +296,7 @@ static struct arm64_ftr_reg arm64_ftr_regs[] = {
/* Op1 = 0, CRn = 0, CRm = 7 */
ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0),
ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1),
+ ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2),
/* Op1 = 3, CRn = 0, CRm = 0 */
ARM64_FTR_REG(SYS_CTR_EL0, ftr_ctr),
@@ -410,6 +421,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1);
init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0);
init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
+ init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
@@ -519,6 +531,8 @@ void update_cpu_features(int cpu,
info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0);
taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu,
info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1);
+ taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu,
+ info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2);
/*
* EL3 is not our concern.
@@ -623,6 +637,23 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry)
return has_sre;
}
+static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry)
+{
+ u32 midr = read_cpuid_id();
+ u32 rv_min, rv_max;
+
+ /* Cavium ThunderX pass 1.x and 2.x */
+ rv_min = 0;
+ rv_max = (1 << MIDR_VARIANT_SHIFT) | MIDR_REVISION_MASK;
+
+ return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max);
+}
+
+static bool runs_at_el2(const struct arm64_cpu_capabilities *entry)
+{
+ return is_kernel_in_hyp_mode();
+}
+
static const struct arm64_cpu_capabilities arm64_features[] = {
{
.desc = "GIC system register CPU interface",
@@ -653,6 +684,33 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.min_field_value = 2,
},
#endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */
+ {
+ .desc = "Software prefetching using PRFM",
+ .capability = ARM64_HAS_NO_HW_PREFETCH,
+ .matches = has_no_hw_prefetch,
+ },
+#ifdef CONFIG_ARM64_UAO
+ {
+ .desc = "User Access Override",
+ .capability = ARM64_HAS_UAO,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64MMFR2_EL1,
+ .field_pos = ID_AA64MMFR2_UAO_SHIFT,
+ .min_field_value = 1,
+ .enable = cpu_enable_uao,
+ },
+#endif /* CONFIG_ARM64_UAO */
+#ifdef CONFIG_ARM64_PAN
+ {
+ .capability = ARM64_ALT_PAN_NOT_UAO,
+ .matches = cpufeature_pan_not_uao,
+ },
+#endif /* CONFIG_ARM64_PAN */
+ {
+ .desc = "Virtualization Host Extensions",
+ .capability = ARM64_HAS_VIRT_HOST_EXTN,
+ .matches = runs_at_el2,
+ },
{},
};
@@ -686,7 +744,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = {
{},
};
-static void cap_set_hwcap(const struct arm64_cpu_capabilities *cap)
+static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap)
{
switch (cap->hwcap_type) {
case CAP_HWCAP:
@@ -731,12 +789,12 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities *
return rc;
}
-static void setup_cpu_hwcaps(void)
+static void __init setup_cpu_hwcaps(void)
{
int i;
const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps;
- for (i = 0; hwcaps[i].desc; i++)
+ for (i = 0; hwcaps[i].matches; i++)
if (hwcaps[i].matches(&hwcaps[i]))
cap_set_hwcap(&hwcaps[i]);
}
@@ -746,11 +804,11 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
{
int i;
- for (i = 0; caps[i].desc; i++) {
+ for (i = 0; caps[i].matches; i++) {
if (!caps[i].matches(&caps[i]))
continue;
- if (!cpus_have_cap(caps[i].capability))
+ if (!cpus_have_cap(caps[i].capability) && caps[i].desc)
pr_info("%s %s\n", info, caps[i].desc);
cpus_set_cap(caps[i].capability);
}
@@ -760,11 +818,12 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
* Run through the enabled capabilities and enable() it on all active
* CPUs
*/
-static void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
+static void __init
+enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
{
int i;
- for (i = 0; caps[i].desc; i++)
+ for (i = 0; caps[i].matches; i++)
if (caps[i].enable && cpus_have_cap(caps[i].capability))
/*
* Use stop_machine() as it schedules the work allowing
@@ -798,35 +857,36 @@ static inline void set_sys_caps_initialised(void)
static u64 __raw_read_system_reg(u32 sys_id)
{
switch (sys_id) {
- case SYS_ID_PFR0_EL1: return (u64)read_cpuid(ID_PFR0_EL1);
- case SYS_ID_PFR1_EL1: return (u64)read_cpuid(ID_PFR1_EL1);
- case SYS_ID_DFR0_EL1: return (u64)read_cpuid(ID_DFR0_EL1);
- case SYS_ID_MMFR0_EL1: return (u64)read_cpuid(ID_MMFR0_EL1);
- case SYS_ID_MMFR1_EL1: return (u64)read_cpuid(ID_MMFR1_EL1);
- case SYS_ID_MMFR2_EL1: return (u64)read_cpuid(ID_MMFR2_EL1);
- case SYS_ID_MMFR3_EL1: return (u64)read_cpuid(ID_MMFR3_EL1);
- case SYS_ID_ISAR0_EL1: return (u64)read_cpuid(ID_ISAR0_EL1);
- case SYS_ID_ISAR1_EL1: return (u64)read_cpuid(ID_ISAR1_EL1);
- case SYS_ID_ISAR2_EL1: return (u64)read_cpuid(ID_ISAR2_EL1);
- case SYS_ID_ISAR3_EL1: return (u64)read_cpuid(ID_ISAR3_EL1);
- case SYS_ID_ISAR4_EL1: return (u64)read_cpuid(ID_ISAR4_EL1);
- case SYS_ID_ISAR5_EL1: return (u64)read_cpuid(ID_ISAR4_EL1);
- case SYS_MVFR0_EL1: return (u64)read_cpuid(MVFR0_EL1);
- case SYS_MVFR1_EL1: return (u64)read_cpuid(MVFR1_EL1);
- case SYS_MVFR2_EL1: return (u64)read_cpuid(MVFR2_EL1);
-
- case SYS_ID_AA64PFR0_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1);
- case SYS_ID_AA64PFR1_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1);
- case SYS_ID_AA64DFR0_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1);
- case SYS_ID_AA64DFR1_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1);
- case SYS_ID_AA64MMFR0_EL1: return (u64)read_cpuid(ID_AA64MMFR0_EL1);
- case SYS_ID_AA64MMFR1_EL1: return (u64)read_cpuid(ID_AA64MMFR1_EL1);
- case SYS_ID_AA64ISAR0_EL1: return (u64)read_cpuid(ID_AA64ISAR0_EL1);
- case SYS_ID_AA64ISAR1_EL1: return (u64)read_cpuid(ID_AA64ISAR1_EL1);
-
- case SYS_CNTFRQ_EL0: return (u64)read_cpuid(CNTFRQ_EL0);
- case SYS_CTR_EL0: return (u64)read_cpuid(CTR_EL0);
- case SYS_DCZID_EL0: return (u64)read_cpuid(DCZID_EL0);
+ case SYS_ID_PFR0_EL1: return read_cpuid(SYS_ID_PFR0_EL1);
+ case SYS_ID_PFR1_EL1: return read_cpuid(SYS_ID_PFR1_EL1);
+ case SYS_ID_DFR0_EL1: return read_cpuid(SYS_ID_DFR0_EL1);
+ case SYS_ID_MMFR0_EL1: return read_cpuid(SYS_ID_MMFR0_EL1);
+ case SYS_ID_MMFR1_EL1: return read_cpuid(SYS_ID_MMFR1_EL1);
+ case SYS_ID_MMFR2_EL1: return read_cpuid(SYS_ID_MMFR2_EL1);
+ case SYS_ID_MMFR3_EL1: return read_cpuid(SYS_ID_MMFR3_EL1);
+ case SYS_ID_ISAR0_EL1: return read_cpuid(SYS_ID_ISAR0_EL1);
+ case SYS_ID_ISAR1_EL1: return read_cpuid(SYS_ID_ISAR1_EL1);
+ case SYS_ID_ISAR2_EL1: return read_cpuid(SYS_ID_ISAR2_EL1);
+ case SYS_ID_ISAR3_EL1: return read_cpuid(SYS_ID_ISAR3_EL1);
+ case SYS_ID_ISAR4_EL1: return read_cpuid(SYS_ID_ISAR4_EL1);
+ case SYS_ID_ISAR5_EL1: return read_cpuid(SYS_ID_ISAR5_EL1);
+ case SYS_MVFR0_EL1: return read_cpuid(SYS_MVFR0_EL1);
+ case SYS_MVFR1_EL1: return read_cpuid(SYS_MVFR1_EL1);
+ case SYS_MVFR2_EL1: return read_cpuid(SYS_MVFR2_EL1);
+
+ case SYS_ID_AA64PFR0_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1);
+ case SYS_ID_AA64PFR1_EL1: return read_cpuid(SYS_ID_AA64PFR1_EL1);
+ case SYS_ID_AA64DFR0_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1);
+ case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR1_EL1);
+ case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1);
+ case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1);
+ case SYS_ID_AA64MMFR2_EL1: return read_cpuid(SYS_ID_AA64MMFR2_EL1);
+ case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1);
+ case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1);
+
+ case SYS_CNTFRQ_EL0: return read_cpuid(SYS_CNTFRQ_EL0);
+ case SYS_CTR_EL0: return read_cpuid(SYS_CTR_EL0);
+ case SYS_DCZID_EL0: return read_cpuid(SYS_DCZID_EL0);
default:
BUG();
return 0;
@@ -876,7 +936,7 @@ void verify_local_cpu_capabilities(void)
return;
caps = arm64_features;
- for (i = 0; caps[i].desc; i++) {
+ for (i = 0; caps[i].matches; i++) {
if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg)
continue;
/*
@@ -889,7 +949,7 @@ void verify_local_cpu_capabilities(void)
caps[i].enable(NULL);
}
- for (i = 0, caps = arm64_hwcaps; caps[i].desc; i++) {
+ for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) {
if (!cpus_have_hwcap(&caps[i]))
continue;
if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i]))
@@ -905,7 +965,7 @@ static inline void set_sys_caps_initialised(void)
#endif /* CONFIG_HOTPLUG_CPU */
-static void setup_feature_capabilities(void)
+static void __init setup_feature_capabilities(void)
{
update_cpu_capabilities(arm64_features, "detected feature:");
enable_cpu_capabilities(arm64_features);
@@ -935,3 +995,9 @@ void __init setup_cpu_features(void)
pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
L1_CACHE_BYTES, cls);
}
+
+static bool __maybe_unused
+cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry)
+{
+ return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO));
+}
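
[Editor's sketch] A side effect worth noting in the hunks above: capability tables are now terminated by a NULL .matches pointer rather than a NULL .desc, so description-less entries such as ARM64_ALT_PAN_NOT_UAO are still walked, and .desc only gates the log line. A toy illustration (types and table are stand-ins for the kernel structures):

#include <stdbool.h>
#include <stdio.h>

struct cap {
	const char *desc;	/* may be NULL for "silent" capabilities */
	bool (*matches)(void);
};

static bool yes(void) { return true; }

static const struct cap caps[] = {
	{ "User Access Override", yes },
	{ NULL, yes },		/* silent meta-capability, still visited */
	{ NULL, NULL },		/* sentinel: matches == NULL ends the walk */
};

int main(void)
{
	for (int i = 0; caps[i].matches; i++)
		if (caps[i].matches() && caps[i].desc)
			printf("detected feature: %s\n", caps[i].desc);
	return 0;
}
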
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 0166cfbc866c..95a6fae54740 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -208,35 +208,36 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
{
info->reg_cntfrq = arch_timer_get_cntfrq();
info->reg_ctr = read_cpuid_cachetype();
- info->reg_dczid = read_cpuid(DCZID_EL0);
+ info->reg_dczid = read_cpuid(SYS_DCZID_EL0);
info->reg_midr = read_cpuid_id();
- info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1);
- info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1);
- info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1);
- info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1);
- info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
- info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
- info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
- info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
-
- info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1);
- info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
- info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
- info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
- info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1);
- info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1);
- info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1);
- info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1);
- info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
- info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
- info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
- info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
- info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);
-
- info->reg_mvfr0 = read_cpuid(MVFR0_EL1);
- info->reg_mvfr1 = read_cpuid(MVFR1_EL1);
- info->reg_mvfr2 = read_cpuid(MVFR2_EL1);
+ info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1);
+ info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1);
+ info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1);
+ info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1);
+ info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1);
+ info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1);
+ info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1);
+ info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1);
+ info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1);
+
+ info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1);
+ info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1);
+ info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1);
+ info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1);
+ info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1);
+ info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1);
+ info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1);
+ info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1);
+ info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1);
+ info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1);
+ info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1);
+ info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1);
+ info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1);
+
+ info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1);
+ info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1);
+ info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1);
cpuinfo_detect_icache_policy(info);
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index c8875b64be90..6de6d9f43b95 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -23,6 +23,7 @@
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/ptrace.h>
+#include <linux/kprobes.h>
#include <linux/stat.h>
#include <linux/uaccess.h>
@@ -48,6 +49,7 @@ static void mdscr_write(u32 mdscr)
asm volatile("msr mdscr_el1, %0" :: "r" (mdscr));
local_dbg_restore(flags);
}
+NOKPROBE_SYMBOL(mdscr_write);
static u32 mdscr_read(void)
{
@@ -55,6 +57,7 @@ static u32 mdscr_read(void)
asm volatile("mrs %0, mdscr_el1" : "=r" (mdscr));
return mdscr;
}
+NOKPROBE_SYMBOL(mdscr_read);
/*
* Allow root to disable self-hosted debug from userspace.
@@ -103,6 +106,7 @@ void enable_debug_monitors(enum dbg_active_el el)
mdscr_write(mdscr);
}
}
+NOKPROBE_SYMBOL(enable_debug_monitors);
void disable_debug_monitors(enum dbg_active_el el)
{
@@ -123,6 +127,7 @@ void disable_debug_monitors(enum dbg_active_el el)
mdscr_write(mdscr);
}
}
+NOKPROBE_SYMBOL(disable_debug_monitors);
/*
* OS lock clearing.
@@ -173,6 +178,7 @@ static void set_regs_spsr_ss(struct pt_regs *regs)
spsr |= DBG_SPSR_SS;
regs->pstate = spsr;
}
+NOKPROBE_SYMBOL(set_regs_spsr_ss);
static void clear_regs_spsr_ss(struct pt_regs *regs)
{
@@ -182,6 +188,7 @@ static void clear_regs_spsr_ss(struct pt_regs *regs)
spsr &= ~DBG_SPSR_SS;
regs->pstate = spsr;
}
+NOKPROBE_SYMBOL(clear_regs_spsr_ss);
/* EL1 Single Step Handler hooks */
static LIST_HEAD(step_hook);
@@ -225,6 +232,7 @@ static int call_step_hook(struct pt_regs *regs, unsigned int esr)
return retval;
}
+NOKPROBE_SYMBOL(call_step_hook);
static int single_step_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
@@ -253,6 +261,10 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
*/
user_rewind_single_step(current);
} else {
+#ifdef CONFIG_KPROBES
+ if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED)
+ return 0;
+#endif
if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED)
return 0;
@@ -266,6 +278,7 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
return 0;
}
+NOKPROBE_SYMBOL(single_step_handler);
/*
* Breakpoint handler is re-entrant as another breakpoint can
@@ -303,6 +316,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr)
return fn ? fn(regs, esr) : DBG_HOOK_ERROR;
}
+NOKPROBE_SYMBOL(call_break_hook);
static int brk_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
@@ -318,13 +332,21 @@ static int brk_handler(unsigned long addr, unsigned int esr,
};
force_sig_info(SIGTRAP, &info, current);
- } else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) {
- pr_warning("Unexpected kernel BRK exception at EL1\n");
+ }
+#ifdef CONFIG_KPROBES
+ else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) {
+ if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED)
+ return -EFAULT;
+ }
+#endif
+ else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) {
+ pr_warn("Unexpected kernel BRK exception at EL1\n");
return -EFAULT;
}
return 0;
}
+NOKPROBE_SYMBOL(brk_handler);
int aarch32_break_handler(struct pt_regs *regs)
{
@@ -369,6 +391,7 @@ int aarch32_break_handler(struct pt_regs *regs)
force_sig_info(SIGTRAP, &info, current);
return 0;
}
+NOKPROBE_SYMBOL(aarch32_break_handler);
static int __init debug_traps_init(void)
{
@@ -390,6 +413,7 @@ void user_rewind_single_step(struct task_struct *task)
if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP))
set_regs_spsr_ss(task_pt_regs(task));
}
+NOKPROBE_SYMBOL(user_rewind_single_step);
void user_fastforward_single_step(struct task_struct *task)
{
@@ -405,6 +429,7 @@ void kernel_enable_single_step(struct pt_regs *regs)
mdscr_write(mdscr_read() | DBG_MDSCR_SS);
enable_debug_monitors(DBG_ACTIVE_EL1);
}
+NOKPROBE_SYMBOL(kernel_enable_single_step);
void kernel_disable_single_step(void)
{
@@ -412,12 +437,14 @@ void kernel_disable_single_step(void)
mdscr_write(mdscr_read() & ~DBG_MDSCR_SS);
disable_debug_monitors(DBG_ACTIVE_EL1);
}
+NOKPROBE_SYMBOL(kernel_disable_single_step);
int kernel_active_single_step(void)
{
WARN_ON(!irqs_disabled());
return mdscr_read() & DBG_MDSCR_SS;
}
+NOKPROBE_SYMBOL(kernel_active_single_step);
/* ptrace API */
void user_enable_single_step(struct task_struct *task)
@@ -427,8 +454,10 @@ void user_enable_single_step(struct task_struct *task)
if (!test_and_set_ti_thread_flag(ti, TIF_SINGLESTEP))
set_regs_spsr_ss(task_pt_regs(task));
}
+NOKPROBE_SYMBOL(user_enable_single_step);
void user_disable_single_step(struct task_struct *task)
{
clear_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP);
}
+NOKPROBE_SYMBOL(user_disable_single_step);
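
[Editor's sketch] The NOKPROBE_SYMBOL() annotations sprinkled through this file keep the kprobes core from planting probes in code that probe handling itself depends on, which would recurse. A rough user-space model of the blacklist check (the real kernel collects these addresses in a dedicated section and compares address ranges; everything below is illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static void mdscr_write_demo(void) { }	/* services single-step state */

/* stand-in for the NOKPROBE_SYMBOL() blacklist section */
static void *nokprobe_list[] = { (void *)mdscr_write_demo };

static bool within_blacklist(void *addr)
{
	for (size_t i = 0; i < sizeof(nokprobe_list) / sizeof(nokprobe_list[0]); i++)
		if (nokprobe_list[i] == addr)
			return true;
	return false;
}

int main(void)
{
	printf("probe on mdscr_write allowed: %d\n",
	       !within_blacklist((void *)mdscr_write_demo));	/* 0: refused */
	return 0;
}
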
diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S
index a773db92908b..936022f0655e 100644
--- a/arch/arm64/kernel/efi-entry.S
+++ b/arch/arm64/kernel/efi-entry.S
@@ -61,7 +61,7 @@ ENTRY(entry)
*/
mov x20, x0 // DTB address
ldr x0, [sp, #16] // relocated _text address
- ldr x21, =stext_offset
+ ldr w21, =stext_offset
add x21, x0, x21
/*
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index bd14849beb73..a096d0980ade 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -27,6 +27,7 @@
#include <asm/cpufeature.h>
#include <asm/errno.h>
#include <asm/esr.h>
+#include <asm/irq.h>
#include <asm/memory.h>
#include <asm/thread_info.h>
#include <asm/unistd.h>
@@ -89,9 +90,12 @@
.if \el == 0
mrs x21, sp_el0
- get_thread_info tsk // Ensure MDSCR_EL1.SS is clear,
+ mov tsk, sp
+ and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear,
ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug
disable_step_tsk x19, x20 // exceptions when scheduling.
+
+ mov x29, xzr // fp pointed to user-space
.else
add x21, sp, #S_FRAME_SIZE
get_thread_info tsk
@@ -115,6 +119,13 @@
.endif
/*
+ * Set sp_el0 to current thread_info.
+ */
+ .if \el == 0
+ msr sp_el0, tsk
+ .endif
+
+ /*
* Registers that may be useful after this macro is invoked:
*
* x21 - aborted SP
@@ -177,8 +188,44 @@ alternative_endif
.endm
.macro get_thread_info, rd
- mov \rd, sp
- and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack
+ mrs \rd, sp_el0
+ .endm
+
+ .macro irq_stack_entry
+ mov x19, sp // preserve the original sp
+
+ /*
+ * Compare sp with the current thread_info, if the top
+ * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
+ * should switch to the irq stack.
+ */
+ and x25, x19, #~(THREAD_SIZE - 1)
+ cmp x25, tsk
+ b.ne 9998f
+
+ this_cpu_ptr irq_stack, x25, x26
+ mov x26, #IRQ_STACK_START_SP
+ add x26, x25, x26
+
+ /* switch to the irq stack */
+ mov sp, x26
+
+ /*
+ * Add a dummy stack frame, this non-standard format is fixed up
+ * by unwind_frame()
+ */
+ stp x29, x19, [sp, #-16]!
+ mov x29, sp
+
+9998:
+ .endm
+
+ /*
+ * x19 should be preserved between irq_stack_entry and
+ * irq_stack_exit.
+ */
+ .macro irq_stack_exit
+ mov sp, x19
.endm
/*
@@ -196,10 +243,11 @@ tsk .req x28 // current thread_info
* Interrupt handling.
*/
.macro irq_handler
- adrp x1, handle_arch_irq
- ldr x1, [x1, #:lo12:handle_arch_irq]
+ ldr_l x1, handle_arch_irq
mov x0, sp
+ irq_stack_entry
blr x1
+ irq_stack_exit
.endm
.text
@@ -207,6 +255,7 @@ tsk .req x28 // current thread_info
/*
* Exception vectors.
*/
+ .pushsection ".entry.text", "ax"
.align 11
ENTRY(vectors)
@@ -371,10 +420,10 @@ el1_irq:
bl trace_hardirqs_off
#endif
+ get_thread_info tsk
irq_handler
#ifdef CONFIG_PREEMPT
- get_thread_info tsk
ldr w24, [tsk, #TI_PREEMPT] // get preempt count
cbnz w24, 1f // preempt count != 0
ldr x0, [tsk, #TI_FLAGS] // get flags
@@ -612,6 +661,8 @@ ENTRY(cpu_switch_to)
ldp x29, x9, [x8], #16
ldr lr, [x8]
mov sp, x9
+ and x9, x9, #~(THREAD_SIZE - 1)
+ msr sp_el0, x9
ret
ENDPROC(cpu_switch_to)
@@ -639,14 +690,14 @@ ret_fast_syscall_trace:
work_pending:
tbnz x1, #TIF_NEED_RESCHED, work_resched
/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
- ldr x2, [sp, #S_PSTATE]
mov x0, sp // 'regs'
- tst x2, #PSR_MODE_MASK // user mode regs?
- b.ne no_work_pending // returning to kernel
enable_irq // enable interrupts for do_notify_resume()
bl do_notify_resume
b ret_to_user
work_resched:
+#ifdef CONFIG_TRACE_IRQFLAGS
+ bl trace_hardirqs_off // the IRQs are off here, inform the tracing code
+#endif
bl schedule
/*
@@ -658,7 +709,6 @@ ret_to_user:
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
enable_step_tsk x1, x2
-no_work_pending:
kernel_exit 0
ENDPROC(ret_to_user)
@@ -738,6 +788,8 @@ __ni_sys_trace:
bl do_ni_syscall
b __sys_trace_return
+ .popsection // .entry.text
+
/*
* Special system call wrappers.
*/
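
[Editor's sketch] The task-stack test that irq_stack_entry performs above is just an alignment check: round sp down to a THREAD_SIZE boundary and compare with the thread_info pointer now cached in sp_el0. In C (values are illustrative; 16K stacks assumed):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define THREAD_SIZE (16 * 1024)

static bool on_task_stack(uintptr_t sp, uintptr_t thread_info)
{
	/* thread_info sits at the base of the THREAD_SIZE-aligned stack */
	return (sp & ~((uintptr_t)THREAD_SIZE - 1)) == thread_info;
}

int main(void)
{
	uintptr_t ti = 0xffff000008f40000;	/* hypothetical thread_info */

	printf("%d\n", on_task_stack(ti + 0x3f00, ti));		/* 1: task stack */
	printf("%d\n", on_task_stack(0xffff000009000100, ti));	/* 0: elsewhere */
	return 0;
}

Only when the test succeeds does the macro switch sp to the per-cpu irq stack; a nested interrupt already on the irq stack fails the comparison and stays put.
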
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 4c46c54a3ad7..acc1afd5c749 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -289,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = {
.notifier_call = fpsimd_cpu_pm_notifier,
};
-static void fpsimd_pm_init(void)
+static void __init fpsimd_pm_init(void)
{
cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block);
}
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index c851be795080..ebecf9aa33d1 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
/*
* Note:
- * Due to modules and __init, code can disappear and change,
- * we need to protect against faulting as well as code changing.
- * We do this by aarch64_insn_*() which use the probe_kernel_*().
- *
- * No lock is held here because all the modifications are run
- * through stop_machine().
+ * We are paranoid about modifying text: if a bug were to happen, it
+ * could cause us to read or write somewhere that could cause harm.
+ * Carefully read and modify the code with aarch64_insn_*() which uses
+ * probe_kernel_*(), and make sure what we read is what we expected it
+ * to be before modifying it.
*/
if (validate) {
if (aarch64_insn_read((void *)pc, &replaced))
@@ -93,6 +92,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
return ftrace_modify_code(pc, old, new, true);
}
+void arch_ftrace_update_code(int command)
+{
+ ftrace_modify_all_code(command);
+}
+
int __init ftrace_dyn_arch_init(void)
{
return 0;
@@ -125,23 +129,20 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
* on other archs. It's unlikely on AArch64.
*/
old = *parent;
- *parent = return_hooker;
trace.func = self_addr;
trace.depth = current->curr_ret_stack + 1;
/* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace)) {
- *parent = old;
+ if (!ftrace_graph_entry(&trace))
return;
- }
err = ftrace_push_return_trace(old, self_addr, &trace.depth,
frame_pointer);
- if (err == -EBUSY) {
- *parent = old;
+ if (err == -EBUSY)
return;
- }
+ else
+ *parent = return_hooker;
}
#ifdef CONFIG_DYNAMIC_FTRACE
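
[Editor's sketch] The rewritten comment above describes a read-validate-write discipline. Stripped of the kernel helpers, the pattern looks like this (memcpy stands in for aarch64_insn_read()/aarch64_insn_patch_text_nosync(); the opcodes are real AArch64 encodings, the rest is illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int modify_code(uint32_t *pc, uint32_t old, uint32_t new, int validate)
{
	uint32_t replaced;

	if (validate) {
		memcpy(&replaced, pc, sizeof(replaced));	/* read current text */
		if (replaced != old)
			return -1;	/* text is not what we expected: bail */
	}
	memcpy(pc, &new, sizeof(new));	/* only now patch in the new insn */
	return 0;
}

int main(void)
{
	uint32_t text = 0xd503201f;	/* NOP */

	/* replace the NOP with a BL only if it is still a NOP */
	printf("%d\n", modify_code(&text, 0xd503201f, 0x94000000, 1));
	return 0;
}
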
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 20ceb5edf7b8..029c466eaa4c 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -25,10 +25,12 @@
#include <linux/irqchip/arm-gic-v3.h>
#include <asm/assembler.h>
+#include <asm/boot.h>
#include <asm/ptrace.h>
#include <asm/asm-offsets.h>
#include <asm/cache.h>
#include <asm/cputype.h>
+#include <asm/elf.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/pgtable-hwdef.h>
@@ -48,9 +50,6 @@
#error TEXT_OFFSET must be less than 2MB
#endif
-#define KERNEL_START _text
-#define KERNEL_END _end
-
/*
* Kernel startup entry point.
* ---------------------------
@@ -67,12 +66,11 @@
* in the entry routines.
*/
__HEAD
-
+_head:
/*
* DO NOT MODIFY. Image header expected by Linux boot-loaders.
*/
#ifdef CONFIG_EFI
-efi_head:
/*
* This add instruction has no meaningful effect except that
* its opcode forms the magic "MZ" signature required by UEFI.
@@ -83,9 +81,9 @@ efi_head:
b stext // branch to kernel start, magic
.long 0 // reserved
#endif
- .quad _kernel_offset_le // Image load offset from start of RAM, little-endian
- .quad _kernel_size_le // Effective size of kernel image, little-endian
- .quad _kernel_flags_le // Informative flags, little-endian
+ le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian
+ le64sym _kernel_size_le // Effective size of kernel image, little-endian
+ le64sym _kernel_flags_le // Informative flags, little-endian
.quad 0 // reserved
.quad 0 // reserved
.quad 0 // reserved
@@ -94,14 +92,12 @@ efi_head:
.byte 0x4d
.byte 0x64
#ifdef CONFIG_EFI
- .long pe_header - efi_head // Offset to the PE header.
+ .long pe_header - _head // Offset to the PE header.
#else
.word 0 // reserved
#endif
#ifdef CONFIG_EFI
- .globl __efistub_stext_offset
- .set __efistub_stext_offset, stext - efi_head
.align 3
pe_header:
.ascii "PE"
@@ -121,11 +117,11 @@ optional_header:
.short 0x20b // PE32+ format
.byte 0x02 // MajorLinkerVersion
.byte 0x14 // MinorLinkerVersion
- .long _end - stext // SizeOfCode
+ .long _end - efi_header_end // SizeOfCode
.long 0 // SizeOfInitializedData
.long 0 // SizeOfUninitializedData
- .long __efistub_entry - efi_head // AddressOfEntryPoint
- .long __efistub_stext_offset // BaseOfCode
+ .long __efistub_entry - _head // AddressOfEntryPoint
+ .long efi_header_end - _head // BaseOfCode
extra_header_fields:
.quad 0 // ImageBase
@@ -139,10 +135,10 @@ extra_header_fields:
.short 0 // MinorSubsystemVersion
.long 0 // Win32VersionValue
- .long _end - efi_head // SizeOfImage
+ .long _end - _head // SizeOfImage
// Everything before the kernel image is considered part of the header
- .long __efistub_stext_offset // SizeOfHeaders
+ .long efi_header_end - _head // SizeOfHeaders
.long 0 // CheckSum
.short 0xa // Subsystem (EFI application)
.short 0 // DllCharacteristics
@@ -186,10 +182,10 @@ section_table:
.byte 0
.byte 0
.byte 0 // end of 0 padding of section name
- .long _end - stext // VirtualSize
- .long __efistub_stext_offset // VirtualAddress
- .long _edata - stext // SizeOfRawData
- .long __efistub_stext_offset // PointerToRawData
+ .long _end - efi_header_end // VirtualSize
+ .long efi_header_end - _head // VirtualAddress
+ .long _edata - efi_header_end // SizeOfRawData
+ .long efi_header_end - _head // PointerToRawData
.long 0 // PointerToRelocations (0 for executables)
.long 0 // PointerToLineNumbers (0 for executables)
@@ -198,19 +194,23 @@ section_table:
.long 0xe0500020 // Characteristics (section flags)
/*
- * EFI will load stext onwards at the 4k section alignment
+ * EFI will load .text onwards at the 4k section alignment
* described in the PE/COFF header. To ensure that instruction
* sequences using an adrp and a :lo12: immediate will function
- * correctly at this alignment, we must ensure that stext is
+ * correctly at this alignment, we must ensure that .text is
* placed at a 4k boundary in the Image to begin with.
*/
.align 12
+efi_header_end:
#endif
+ __INIT
+
ENTRY(stext)
bl preserve_boot_args
bl el2_setup // Drop to EL1, w20=cpu_boot_mode
adrp x24, __PHYS_OFFSET
+ and x23, x24, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
bl set_cpu_boot_mode_flag
bl __create_page_tables // x25=TTBR0, x26=TTBR1
/*
@@ -219,10 +219,10 @@ ENTRY(stext)
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
- ldr x27, =__mmap_switched // address to jump to after
+ bl __cpu_setup // initialise processor
+ adr_l x27, __primary_switch // address to jump to after
// MMU has been enabled
- adr_l lr, __enable_mmu // return (PIC) address
- b __cpu_setup // initialise processor
+ b __enable_mmu
ENDPROC(stext)
/*
@@ -311,7 +311,7 @@ ENDPROC(preserve_boot_args)
__create_page_tables:
adrp x25, idmap_pg_dir
adrp x26, swapper_pg_dir
- mov x27, lr
+ mov x28, lr
/*
* Invalidate the idmap and swapper page tables to avoid potential
@@ -333,7 +333,7 @@ __create_page_tables:
cmp x0, x6
b.lo 1b
- ldr x7, =SWAPPER_MM_MMUFLAGS
+ mov x7, SWAPPER_MM_MMUFLAGS
/*
* Create the identity mapping.
@@ -389,10 +389,13 @@ __create_page_tables:
* Map the kernel image (starting with PHYS_OFFSET).
*/
mov x0, x26 // swapper_pg_dir
- mov x5, #PAGE_OFFSET
+ mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text)
+ add x5, x5, x23 // add KASLR displacement
create_pgd_entry x0, x5, x3, x6
- ldr x6, =KERNEL_END // __va(KERNEL_END)
- mov x3, x24 // phys offset
+ adrp x6, _end // runtime __pa(_end)
+ adrp x3, _text // runtime __pa(_text)
+ sub x6, x6, x3 // _end - _text
+ add x6, x6, x5 // runtime __va(_end)
create_block_map x0, x7, x3, x5, x6
/*
@@ -405,8 +408,7 @@ __create_page_tables:
dmb sy
bl __inval_cache_range
- mov lr, x27
- ret
+ ret x28
ENDPROC(__create_page_tables)
.ltorg
@@ -414,30 +416,58 @@ ENDPROC(__create_page_tables)
* The following fragment of code is executed with the MMU enabled.
*/
.set initial_sp, init_thread_union + THREAD_START_SP
-__mmap_switched:
- adr_l x6, __bss_start
- adr_l x7, __bss_stop
-
-1: cmp x6, x7
- b.hs 2f
- str xzr, [x6], #8 // Clear BSS
- b 1b
-2:
+__primary_switched:
+ mov x28, lr // preserve LR
+ adr_l x8, vectors // load VBAR_EL1 with virtual
+ msr vbar_el1, x8 // vector table address
+ isb
+
+ // Clear BSS
+ adr_l x0, __bss_start
+ mov x1, xzr
+ adr_l x2, __bss_stop
+ sub x2, x2, x0
+ bl __pi_memset
+ dsb ishst // Make zero page visible to PTW
+
adr_l sp, initial_sp, x4
+ mov x4, sp
+ and x4, x4, #~(THREAD_SIZE - 1)
+ msr sp_el0, x4 // Save thread_info
str_l x21, __fdt_pointer, x5 // Save FDT pointer
- str_l x24, memstart_addr, x6 // Save PHYS_OFFSET
+
+ ldr_l x4, kimage_vaddr // Save the offset between
+ sub x4, x4, x24 // the kernel virtual and
+ str_l x4, kimage_voffset, x5 // physical mappings
+
mov x29, #0
#ifdef CONFIG_KASAN
bl kasan_early_init
#endif
+#ifdef CONFIG_RANDOMIZE_BASE
+ tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
+ b.ne 0f
+ mov x0, x21 // pass FDT address in x0
+ mov x1, x23 // pass modulo offset in x1
+ bl kaslr_early_init // parse FDT for KASLR options
+ cbz x0, 0f // KASLR disabled? just proceed
+ orr x23, x23, x0 // record KASLR offset
+ ret x28 // we must enable KASLR, return
+ // to __enable_mmu()
+0:
+#endif
b start_kernel
-ENDPROC(__mmap_switched)
+ENDPROC(__primary_switched)
/*
* end early head section, begin head code that is also used for
* hotplug and needs to have the same protections as the text region
*/
.section ".text","ax"
+
+ENTRY(kimage_vaddr)
+ .quad _text - TEXT_OFFSET
+
/*
* If we're fortunate enough to boot at EL2, ensure that the world is
* sane before dropping to EL1.
@@ -543,7 +573,7 @@ ENDPROC(el2_setup)
* Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
* in x20. See arch/arm64/include/asm/virt.h for more info.
*/
-ENTRY(set_cpu_boot_mode_flag)
+set_cpu_boot_mode_flag:
adr_l x1, __boot_cpu_mode
cmp w20, #BOOT_CPU_MODE_EL2
b.ne 1f
@@ -576,7 +606,7 @@ ENTRY(secondary_holding_pen)
bl el2_setup // Drop to EL1, w20=cpu_boot_mode
bl set_cpu_boot_mode_flag
mrs x0, mpidr_el1
- ldr x1, =MPIDR_HWID_BITMASK
+ mov_q x1, MPIDR_HWID_BITMASK
and x0, x0, x1
adr_l x3, secondary_holding_pen_release
pen: ldr x4, [x3]
@@ -596,7 +626,7 @@ ENTRY(secondary_entry)
b secondary_startup
ENDPROC(secondary_entry)
-ENTRY(secondary_startup)
+secondary_startup:
/*
* Common entry point for secondary CPUs.
*/
@@ -604,14 +634,19 @@ ENTRY(secondary_startup)
adrp x26, swapper_pg_dir
bl __cpu_setup // initialise processor
- ldr x21, =secondary_data
- ldr x27, =__secondary_switched // address to jump to after enabling the MMU
+ adr_l x27, __secondary_switch // address to jump to after enabling the MMU
b __enable_mmu
ENDPROC(secondary_startup)
-ENTRY(__secondary_switched)
- ldr x0, [x21] // get secondary_data.stack
+__secondary_switched:
+ adr_l x5, vectors
+ msr vbar_el1, x5
+ isb
+
+ ldr_l x0, secondary_data // get secondary_data.stack
mov sp, x0
+ and x0, x0, #~(THREAD_SIZE - 1)
+ msr sp_el0, x0 // save thread_info
mov x29, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
@@ -628,13 +663,12 @@ ENDPROC(__secondary_switched)
* If it isn't, park the CPU
*/
.section ".idmap.text", "ax"
-__enable_mmu:
+ENTRY(__enable_mmu)
+ mrs x18, sctlr_el1 // preserve old SCTLR_EL1 value
mrs x1, ID_AA64MMFR0_EL1
ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4
cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED
b.ne __no_granule_support
- ldr x5, =vectors
- msr vbar_el1, x5
msr ttbr0_el1, x25 // load TTBR0
msr ttbr1_el1, x26 // load TTBR1
isb
@@ -648,6 +682,25 @@ __enable_mmu:
ic iallu
dsb nsh
isb
+#ifdef CONFIG_RANDOMIZE_BASE
+ mov x19, x0 // preserve new SCTLR_EL1 value
+ blr x27
+
+ /*
+ * If we return here, we have a KASLR displacement in x23 which we need
+ * to take into account by discarding the current kernel mapping and
+ * creating a new one.
+ */
+ msr sctlr_el1, x18 // disable the MMU
+ isb
+ bl __create_page_tables // recreate kernel mapping
+
+ msr sctlr_el1, x19 // re-enable the MMU
+ isb
+ ic iallu // flush instructions fetched
+ dsb nsh // via old mapping
+ isb
+#endif
br x27
ENDPROC(__enable_mmu)
@@ -655,3 +708,38 @@ __no_granule_support:
wfe
b __no_granule_support
ENDPROC(__no_granule_support)
+
+__primary_switch:
+#ifdef CONFIG_RELOCATABLE
+ /*
+ * Iterate over each entry in the relocation table, and apply the
+ * relocations in place.
+ */
+ ldr w9, =__rela_offset // offset to reloc table
+ ldr w10, =__rela_size // size of reloc table
+
+ mov_q x11, KIMAGE_VADDR // default virtual offset
+ add x11, x11, x23 // actual virtual offset
+ add x9, x9, x11 // __va(.rela)
+ add x10, x9, x10 // __va(.rela) + sizeof(.rela)
+
+0: cmp x9, x10
+ b.hs 1f
+ ldp x11, x12, [x9], #24
+ ldr x13, [x9, #-8]
+ cmp w12, #R_AARCH64_RELATIVE
+ b.ne 0b
+ add x13, x13, x23 // relocate
+ str x13, [x11, x23]
+ b 0b
+
+1:
+#endif
+ ldr x8, =__primary_switched
+ br x8
+ENDPROC(__primary_switch)
+
+__secondary_switch:
+ ldr x8, =__secondary_switched
+ br x8
+ENDPROC(__secondary_switch)
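
[Editor's sketch] The CONFIG_RELOCATABLE loop in __primary_switch above walks the image's Elf64_Rela entries and applies the KASLR displacement to every R_AARCH64_RELATIVE slot. A runnable C rendering (the fake image array stands in for kernel memory, and r_offset is modelled relative to the image base; in the real loop both the store address and the stored value get the displacement added):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define R_AARCH64_RELATIVE 1027

typedef struct { uint64_t r_offset, r_info, r_addend; } Elf64_Rela;

static void apply_relocations(uint64_t *image, const Elf64_Rela *rela,
			      size_t count, uint64_t kaslr_offset)
{
	for (size_t i = 0; i < count; i++) {
		if ((uint32_t)rela[i].r_info != R_AARCH64_RELATIVE)
			continue;
		/* place the displaced value at the relocated slot */
		image[rela[i].r_offset / 8] = rela[i].r_addend + kaslr_offset;
	}
}

int main(void)
{
	uint64_t image[4] = { 0 };
	Elf64_Rela rela[] = { { 16, R_AARCH64_RELATIVE, 0x100 } };

	apply_relocations(image, rela, 1, 0x200000);
	printf("%#llx\n", (unsigned long long)image[2]);	/* 0x200100 */
	return 0;
}
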
diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S
new file mode 100644
index 000000000000..46f29b6560ec
--- /dev/null
+++ b/arch/arm64/kernel/hibernate-asm.S
@@ -0,0 +1,176 @@
+/*
+ * Hibernate low-level support
+ *
+ * Copyright (C) 2016 ARM Ltd.
+ * Author: James Morse <james.morse@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <linux/errno.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
+#include <asm/cputype.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+#include <asm/virt.h>
+
+/*
+ * To prevent the possibility of old and new partial table walks being visible
+ * in the tlb, switch the ttbr to a zero page when we invalidate the old
+ * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i
+ * Even switching to our copied tables will cause a changed output address at
+ * each stage of the walk.
+ */
+.macro break_before_make_ttbr_switch zero_page, page_table
+ msr ttbr1_el1, \zero_page
+ isb
+ tlbi vmalle1is
+ dsb ish
+ msr ttbr1_el1, \page_table
+ isb
+.endm
+
+
+/*
+ * Resume from hibernate
+ *
+ * Loads temporary page tables then restores the memory image.
+ * Finally branches to cpu_resume() to restore the state saved by
+ * swsusp_arch_suspend().
+ *
+ * Because this code has to be copied to a 'safe' page, it can't call out to
+ * other functions by PC-relative address. Also remember that it may be
+ * mid-way through over-writing other functions. For this reason it contains
+ * code from flush_icache_range() and uses the copy_page() macro.
+ *
+ * This 'safe' page is mapped via ttbr0, and executed from there. This function
+ * switches to a copy of the linear map in ttbr1, performs the restore, then
+ * switches ttbr1 to the original kernel's swapper_pg_dir.
+ *
+ * All of memory gets written to, including code. We need to clean the kernel
+ * text to the Point of Coherence (PoC) before secondary cores can be booted.
+ * Because the kernel modules and executable pages mapped to user space are
+ * also written as data, we clean all pages we touch to the Point of
+ * Unification (PoU).
+ *
+ * x0: physical address of temporary page tables
+ * x1: physical address of swapper page tables
+ * x2: address of cpu_resume
+ * x3: linear map address of restore_pblist in the current kernel
+ * x4: physical address of __hyp_stub_vectors, or 0
+ * x5: physical address of a zero page that remains zero after resume
+ */
+.pushsection ".hibernate_exit.text", "ax"
+ENTRY(swsusp_arch_suspend_exit)
+ /*
+ * We execute from ttbr0, change ttbr1 to our copied linear map tables
+ * with a break-before-make via the zero page
+ */
+ break_before_make_ttbr_switch x5, x0
+
+ mov x21, x1
+ mov x30, x2
+ mov x24, x4
+ mov x25, x5
+
+ /* walk the restore_pblist and use copy_page() to over-write memory */
+ mov x19, x3
+
+1: ldr x10, [x19, #HIBERN_PBE_ORIG]
+ mov x0, x10
+ ldr x1, [x19, #HIBERN_PBE_ADDR]
+
+ copy_page x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
+
+ add x1, x10, #PAGE_SIZE
+ /* Clean the copied page to PoU - based on flush_icache_range() */
+ dcache_line_size x2, x3
+ sub x3, x2, #1
+ bic x4, x10, x3
+2: dc cvau, x4 /* clean D line / unified line */
+ add x4, x4, x2
+ cmp x4, x1
+ b.lo 2b
+
+ ldr x19, [x19, #HIBERN_PBE_NEXT]
+ cbnz x19, 1b
+ dsb ish /* wait for PoU cleaning to finish */
+
+ /* switch to the restored kernel's page tables */
+ break_before_make_ttbr_switch x25, x21
+
+ ic ialluis
+ dsb ish
+ isb
+
+ cbz x24, 3f /* Do we need to re-initialise EL2? */
+ hvc #0
+3: ret
+
+ .ltorg
+ENDPROC(swsusp_arch_suspend_exit)
+
+/*
+ * Restore the hyp stub.
+ * This must be done before the hibernate page is unmapped by _cpu_resume(),
+ * but happens before any of the hyp-stub's code is cleaned to PoC.
+ *
+ * x24: The physical address of __hyp_stub_vectors
+ */
+el1_sync:
+ msr vbar_el2, x24
+ eret
+ENDPROC(el1_sync)
+
+.macro invalid_vector label
+\label:
+ b \label
+ENDPROC(\label)
+.endm
+
+ invalid_vector el2_sync_invalid
+ invalid_vector el2_irq_invalid
+ invalid_vector el2_fiq_invalid
+ invalid_vector el2_error_invalid
+ invalid_vector el1_sync_invalid
+ invalid_vector el1_irq_invalid
+ invalid_vector el1_fiq_invalid
+ invalid_vector el1_error_invalid
+
+/* el2 vectors - switch el2 here while we restore the memory image. */
+ .align 11
+ENTRY(hibernate_el2_vectors)
+ ventry el2_sync_invalid // Synchronous EL2t
+ ventry el2_irq_invalid // IRQ EL2t
+ ventry el2_fiq_invalid // FIQ EL2t
+ ventry el2_error_invalid // Error EL2t
+
+ ventry el2_sync_invalid // Synchronous EL2h
+ ventry el2_irq_invalid // IRQ EL2h
+ ventry el2_fiq_invalid // FIQ EL2h
+ ventry el2_error_invalid // Error EL2h
+
+ ventry el1_sync // Synchronous 64-bit EL1
+ ventry el1_irq_invalid // IRQ 64-bit EL1
+ ventry el1_fiq_invalid // FIQ 64-bit EL1
+ ventry el1_error_invalid // Error 64-bit EL1
+
+ ventry el1_sync_invalid // Synchronous 32-bit EL1
+ ventry el1_irq_invalid // IRQ 32-bit EL1
+ ventry el1_fiq_invalid // FIQ 32-bit EL1
+ ventry el1_error_invalid // Error 32-bit EL1
+END(hibernate_el2_vectors)
+
+.popsection
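
[Editor's sketch] The dcache_line_size/dc cvau loop in swsusp_arch_suspend_exit() is the standard clean-one-page-by-cache-lines idiom: align down to a line boundary, then issue one maintenance operation per line. In C, with clean_line() standing in for the 'dc cvau' instruction and the line size chosen arbitrarily:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static void clean_line(uintptr_t addr) { (void)addr; /* dc cvau, addr */ }

static void clean_page_to_pou(uintptr_t page, unsigned int line_size)
{
	uintptr_t end = page + PAGE_SIZE;
	uintptr_t addr = page & ~((uintptr_t)line_size - 1);

	for (; addr < end; addr += line_size)
		clean_line(addr);
}

int main(void)
{
	clean_page_to_pou(0x10000, 64);
	printf("cleaned one page in 64-byte lines\n");
	return 0;
}
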
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
new file mode 100644
index 000000000000..f8df75d740f4
--- /dev/null
+++ b/arch/arm64/kernel/hibernate.c
@@ -0,0 +1,487 @@
+/*
+ * Hibernate support specific for ARM64
+ *
+ * Derived from work on ARM hibernation support by:
+ *
+ * Ubuntu project, hibernation support for mach-dove
+ * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu)
+ * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.)
+ * https://lkml.org/lkml/2010/6/18/4
+ * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html
+ * https://patchwork.kernel.org/patch/96442/
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#define pr_fmt(x) "hibernate: " x
+#include <linux/kvm_host.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/pm.h>
+#include <linux/sched.h>
+#include <linux/suspend.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+#include <asm/barrier.h>
+#include <asm/cacheflush.h>
+#include <asm/irqflags.h>
+#include <asm/memory.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/pgtable-hwdef.h>
+#include <asm/sections.h>
+#include <asm/suspend.h>
+#include <asm/virt.h>
+
+/*
+ * Hibernate core relies on this value being 0 on resume, and marks it
+ * __nosavedata assuming it will keep the resume kernel's '0' value. This
+ * doesn't happen when KASLR relocates the resume kernel.
+ *
+ * defined as "__visible int in_suspend __nosavedata" in
+ * kernel/power/hibernate.c
+ */
+extern int in_suspend;
+
+/* Find a symbol's alias in the linear map */
+#define LMADDR(x) phys_to_virt(virt_to_phys(x))
+
+/* Do we need to reset el2? */
+#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode())
+
+/*
+ * Start/end of the hibernate exit code, this must be copied to a 'safe'
+ * location in memory, and executed from there.
+ */
+extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
+
+/* temporary el2 vectors in the __hibernate_exit_text section. */
+extern char hibernate_el2_vectors[];
+
+/* hyp-stub vectors, used to restore el2 during resume from hibernate. */
+extern char __hyp_stub_vectors[];
+
+/*
+ * Values that may not change over hibernate/resume. We put the build number
+ * and date in here so that we guarantee not to resume with a different
+ * kernel.
+ */
+struct arch_hibernate_hdr_invariants {
+ char uts_version[__NEW_UTS_LEN + 1];
+};
+
+/* These values need to be known across a hibernate/restore. */
+static struct arch_hibernate_hdr {
+ struct arch_hibernate_hdr_invariants invariants;
+
+ /* These are needed to find the relocated kernel if built with kaslr */
+ phys_addr_t ttbr1_el1;
+ void (*reenter_kernel)(void);
+
+ /*
+ * We need to know where the __hyp_stub_vectors are after restore to
+ * re-configure el2.
+ */
+ phys_addr_t __hyp_stub_vectors;
+} resume_hdr;
+
+static inline void arch_hdr_invariants(struct arch_hibernate_hdr_invariants *i)
+{
+ memset(i, 0, sizeof(*i));
+ memcpy(i->uts_version, init_utsname()->version, sizeof(i->uts_version));
+}
+
+int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn = virt_to_pfn(&__nosave_begin);
+ unsigned long nosave_end_pfn = virt_to_pfn(&__nosave_end - 1);
+
+ return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn);
+}
+
+void notrace save_processor_state(void)
+{
+ WARN_ON(num_online_cpus() != 1);
+}
+
+void notrace restore_processor_state(void)
+{
+}
+
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+ struct arch_hibernate_hdr *hdr = addr;
+
+ if (max_size < sizeof(*hdr))
+ return -EOVERFLOW;
+
+ arch_hdr_invariants(&hdr->invariants);
+ hdr->ttbr1_el1 = virt_to_phys(swapper_pg_dir);
+ hdr->reenter_kernel = _cpu_resume;
+
+ /* We can't use __hyp_get_vectors() because kvm may still be loaded */
+ if (el2_reset_needed())
+ hdr->__hyp_stub_vectors = virt_to_phys(__hyp_stub_vectors);
+ else
+ hdr->__hyp_stub_vectors = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL(arch_hibernation_header_save);
+
+int arch_hibernation_header_restore(void *addr)
+{
+ struct arch_hibernate_hdr_invariants invariants;
+ struct arch_hibernate_hdr *hdr = addr;
+
+ arch_hdr_invariants(&invariants);
+ if (memcmp(&hdr->invariants, &invariants, sizeof(invariants))) {
+ pr_crit("Hibernate image not generated by this kernel!\n");
+ return -EINVAL;
+ }
+
+ resume_hdr = *hdr;
+
+ return 0;
+}
+EXPORT_SYMBOL(arch_hibernation_header_restore);
+
+/*
+ * Copies length bytes, starting at src_start, into a new page,
+ * performs cache maintenance, then maps it at the specified low
+ * address as executable.
+ *
+ * This is used by hibernate to copy the code it needs to execute when
+ * overwriting the kernel text. This function generates a new set of page
+ * tables, which it loads into ttbr0.
+ *
+ * Length is provided as we probably only want 4K of data, even on a 64K
+ * page system.
+ */
+static int create_safe_exec_page(void *src_start, size_t length,
+ unsigned long dst_addr,
+ phys_addr_t *phys_dst_addr,
+ void *(*allocator)(gfp_t mask),
+ gfp_t mask)
+{
+ int rc = 0;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long dst = (unsigned long)allocator(mask);
+
+ if (!dst) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ memcpy((void *)dst, src_start, length);
+ flush_icache_range(dst, dst + length);
+
+ pgd = pgd_offset_raw(allocator(mask), dst_addr);
+ if (pgd_none(*pgd)) {
+ pud = allocator(mask);
+ if (!pud) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ pgd_populate(&init_mm, pgd, pud);
+ }
+
+ pud = pud_offset(pgd, dst_addr);
+ if (pud_none(*pud)) {
+ pmd = allocator(mask);
+ if (!pmd) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ pud_populate(&init_mm, pud, pmd);
+ }
+
+ pmd = pmd_offset(pud, dst_addr);
+ if (pmd_none(*pmd)) {
+ pte = allocator(mask);
+ if (!pte) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ pmd_populate_kernel(&init_mm, pmd, pte);
+ }
+
+ pte = pte_offset_kernel(pmd, dst_addr);
+ set_pte(pte, __pte(virt_to_phys((void *)dst) |
+ pgprot_val(PAGE_KERNEL_EXEC)));
+
+ /* Load our new page tables */
+ asm volatile("msr ttbr0_el1, %0;"
+ "isb;"
+ "tlbi vmalle1is;"
+ "dsb ish;"
+ "isb" : : "r"(virt_to_phys(pgd)));
+
+ *phys_dst_addr = virt_to_phys((void *)dst);
+
+out:
+ return rc;
+}
+
+
+int swsusp_arch_suspend(void)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct sleep_stack_data state;
+
+ local_dbg_save(flags);
+
+ if (__cpu_suspend_enter(&state)) {
+ ret = swsusp_save();
+ } else {
+ /* Clean kernel to PoC for secondary core startup */
+ __flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START);
+
+ /*
+ * Tell the hibernation core that we've just restored
+ * the memory
+ */
+ in_suspend = 0;
+
+ __cpu_suspend_exit();
+ }
+
+ local_dbg_restore(flags);
+
+ return ret;
+}
+
+static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start,
+ unsigned long end)
+{
+ pte_t *src_pte;
+ pte_t *dst_pte;
+ unsigned long addr = start;
+
+ dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC);
+ if (!dst_pte)
+ return -ENOMEM;
+ pmd_populate_kernel(&init_mm, dst_pmd, dst_pte);
+ dst_pte = pte_offset_kernel(dst_pmd, start);
+
+ src_pte = pte_offset_kernel(src_pmd, start);
+ do {
+ if (!pte_none(*src_pte))
+ /*
+ * Resume will overwrite areas that may be marked
+ * read only (code, rodata). Clear the RDONLY bit from
+ * the temporary mappings we use during restore.
+ */
+ set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY));
+ } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+
+ return 0;
+}
+
+static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start,
+ unsigned long end)
+{
+ pmd_t *src_pmd;
+ pmd_t *dst_pmd;
+ unsigned long next;
+ unsigned long addr = start;
+
+ if (pud_none(*dst_pud)) {
+ dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+ if (!dst_pmd)
+ return -ENOMEM;
+ pud_populate(&init_mm, dst_pud, dst_pmd);
+ }
+ dst_pmd = pmd_offset(dst_pud, start);
+
+ src_pmd = pmd_offset(src_pud, start);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(*src_pmd))
+ continue;
+ if (pmd_table(*src_pmd)) {
+ if (copy_pte(dst_pmd, src_pmd, addr, next))
+ return -ENOMEM;
+ } else {
+ set_pmd(dst_pmd,
+ __pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY));
+ }
+ } while (dst_pmd++, src_pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start,
+ unsigned long end)
+{
+ pud_t *dst_pud;
+ pud_t *src_pud;
+ unsigned long next;
+ unsigned long addr = start;
+
+ if (pgd_none(*dst_pgd)) {
+ dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+ if (!dst_pud)
+ return -ENOMEM;
+ pgd_populate(&init_mm, dst_pgd, dst_pud);
+ }
+ dst_pud = pud_offset(dst_pgd, start);
+
+ src_pud = pud_offset(src_pgd, start);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none(*src_pud))
+ continue;
+ if (pud_table(*(src_pud))) {
+ if (copy_pmd(dst_pud, src_pud, addr, next))
+ return -ENOMEM;
+ } else {
+ set_pud(dst_pud,
+ __pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY));
+ }
+ } while (dst_pud++, src_pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int copy_page_tables(pgd_t *dst_pgd, unsigned long start,
+ unsigned long end)
+{
+ unsigned long next;
+ unsigned long addr = start;
+ pgd_t *src_pgd = pgd_offset_k(start);
+
+ dst_pgd = pgd_offset_raw(dst_pgd, start);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(*src_pgd))
+ continue;
+ if (copy_pud(dst_pgd, src_pgd, addr, next))
+ return -ENOMEM;
+ } while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ return 0;
+}
+
+/*
+ * Set up, then resume from the hibernate image using swsusp_arch_suspend_exit().
+ *
+ * Memory allocated by get_safe_page() will be dealt with by the hibernate code;
+ * we don't need to free it here.
+ */
+int swsusp_arch_resume(void)
+{
+ int rc = 0;
+ void *zero_page;
+ size_t exit_size;
+ pgd_t *tmp_pg_dir;
+ void *lm_restore_pblist;
+ phys_addr_t phys_hibernate_exit;
+ void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
+ void *, phys_addr_t, phys_addr_t);
+
+ /*
+ * Locate the exit code in the bottom-but-one page, so that *NULL
+	 * still has disastrous effects.
+ */
+ hibernate_exit = (void *)PAGE_SIZE;
+ exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start;
+ /*
+ * Copy swsusp_arch_suspend_exit() to a safe page. This will generate
+ * a new set of ttbr0 page tables and load them.
+ */
+ rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size,
+ (unsigned long)hibernate_exit,
+ &phys_hibernate_exit,
+ (void *)get_safe_page, GFP_ATOMIC);
+ if (rc) {
+		pr_err("Failed to create safe executable page for hibernate_exit code.\n");
+ goto out;
+ }
+
+ /*
+ * The hibernate exit text contains a set of el2 vectors, that will
+ * be executed at el2 with the mmu off in order to reload hyp-stub.
+ */
+ __flush_dcache_area(hibernate_exit, exit_size);
+
+ /*
+ * Restoring the memory image will overwrite the ttbr1 page tables.
+ * Create a second copy of just the linear map, and use this when
+ * restoring.
+ */
+ tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!tmp_pg_dir) {
+		pr_err("Failed to allocate memory for temporary page tables.\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0);
+ if (rc)
+ goto out;
+
+ /*
+ * Since we only copied the linear map, we need to find restore_pblist's
+ * linear map address.
+ */
+ lm_restore_pblist = LMADDR(restore_pblist);
+
+ /*
+ * KASLR will cause the el2 vectors to be in a different location in
+ * the resumed kernel. Load hibernate's temporary copy into el2.
+ *
+ * We can skip this step if we booted at EL1, or are running with VHE.
+ */
+ if (el2_reset_needed()) {
+ phys_addr_t el2_vectors = phys_hibernate_exit; /* base */
+ el2_vectors += hibernate_el2_vectors -
+ __hibernate_exit_text_start; /* offset */
+
+ __hyp_set_vectors(el2_vectors);
+ }
+
+ /*
+	 * We need a zero page that is zero before & after resume in order to
+	 * break-before-make on the ttbr1 page tables.
+ */
+ zero_page = (void *)get_safe_page(GFP_ATOMIC);
+
+ hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1,
+ resume_hdr.reenter_kernel, lm_restore_pblist,
+ resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page));
+
+out:
+ return rc;
+}
+
+static int check_boot_cpu_online_pm_callback(struct notifier_block *nb,
+ unsigned long action, void *ptr)
+{
+ if (action == PM_HIBERNATION_PREPARE &&
+ cpumask_first(cpu_online_mask) != 0) {
+ pr_warn("CPU0 is offline.\n");
+ return notifier_from_errno(-ENODEV);
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init check_boot_cpu_online_init(void)
+{
+ /*
+ * Set this pm_notifier callback with a lower priority than
+ * cpu_hotplug_pm_callback, so that cpu_hotplug_pm_callback will be
+ * called earlier to disable cpu hotplug before the cpu online check.
+ */
+ pm_notifier(check_boot_cpu_online_pm_callback, -INT_MAX);
+
+ return 0;
+}
+core_initcall(check_boot_cpu_online_init);
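
Editor's note: LMADDR(), used above in swsusp_arch_suspend() and for restore_pblist, converts a kernel-image virtual address into its linear-map alias; the copied tables cover only the linear map, so the restore path must use that alias. A minimal sketch of such a helper, assuming the usual virt_to_phys()/phys_to_virt() round-trip (the actual definition sits earlier in this file, outside the hunks shown):

/*
 * Sketch only (assumed definition): translate a kernel-image virtual
 * address to its linear-map alias by round-tripping through the
 * physical address.
 */
#define LMADDR(x)	phys_to_virt(virt_to_phys(x))
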
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index b45c95d34b83..367a954f9937 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -24,6 +24,7 @@
#include <linux/cpu_pm.h>
#include <linux/errno.h>
#include <linux/hw_breakpoint.h>
+#include <linux/kprobes.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/smp.h>
@@ -127,6 +128,7 @@ static u64 read_wb_reg(int reg, int n)
return val;
}
+NOKPROBE_SYMBOL(read_wb_reg);
static void write_wb_reg(int reg, int n, u64 val)
{
@@ -140,6 +142,7 @@ static void write_wb_reg(int reg, int n, u64 val)
}
isb();
}
+NOKPROBE_SYMBOL(write_wb_reg);
/*
* Convert a breakpoint privilege level to the corresponding exception
@@ -157,6 +160,7 @@ static enum dbg_active_el debug_exception_level(int privilege)
return -EINVAL;
}
}
+NOKPROBE_SYMBOL(debug_exception_level);
enum hw_breakpoint_ops {
HW_BREAKPOINT_INSTALL,
@@ -575,6 +579,7 @@ static void toggle_bp_registers(int reg, enum dbg_active_el el, int enable)
write_wb_reg(reg, i, ctrl);
}
}
+NOKPROBE_SYMBOL(toggle_bp_registers);
/*
* Debug exception handlers.
@@ -654,6 +659,7 @@ unlock:
return 0;
}
+NOKPROBE_SYMBOL(breakpoint_handler);
static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
@@ -756,6 +762,7 @@ unlock:
return 0;
}
+NOKPROBE_SYMBOL(watchpoint_handler);
/*
* Handle single-step exception.
@@ -813,6 +820,7 @@ int reinstall_suspended_bps(struct pt_regs *regs)
return !handled_exception;
}
+NOKPROBE_SYMBOL(reinstall_suspended_bps);
/*
* Context-switcher for restoring suspended breakpoints.
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index a272f335c289..8727f4490772 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -22,6 +22,8 @@
#include <linux/irqchip/arm-gic-v3.h>
#include <asm/assembler.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
#include <asm/ptrace.h>
#include <asm/virt.h>
@@ -53,15 +55,26 @@ ENDPROC(__hyp_stub_vectors)
.align 11
el1_sync:
- mrs x1, esr_el2
- lsr x1, x1, #26
- cmp x1, #0x16
- b.ne 2f // Not an HVC trap
- cbz x0, 1f
- msr vbar_el2, x0 // Set vbar_el2
- b 2f
-1: mrs x0, vbar_el2 // Return vbar_el2
-2: eret
+ mrs x30, esr_el2
+ lsr x30, x30, #ESR_ELx_EC_SHIFT
+
+ cmp x30, #ESR_ELx_EC_HVC64
+ b.ne 9f // Not an HVC trap
+
+ cmp x0, #HVC_GET_VECTORS
+ b.ne 1f
+ mrs x0, vbar_el2
+ b 9f
+
+1: cmp x0, #HVC_SET_VECTORS
+ b.ne 2f
+ msr vbar_el2, x1
+ b 9f
+
+ /* Someone called kvm_call_hyp() against the hyp-stub... */
+2: mov x0, #ARM_EXCEPTION_HYP_GONE
+
+9: eret
ENDPROC(el1_sync)
.macro invalid_vector label
@@ -101,10 +114,18 @@ ENDPROC(\label)
*/
ENTRY(__hyp_get_vectors)
- mov x0, xzr
- // fall through
-ENTRY(__hyp_set_vectors)
+ str lr, [sp, #-16]!
+ mov x0, #HVC_GET_VECTORS
hvc #0
+ ldr lr, [sp], #16
ret
ENDPROC(__hyp_get_vectors)
+
+ENTRY(__hyp_set_vectors)
+ str lr, [sp, #-16]!
+ mov x1, x0
+ mov x0, #HVC_SET_VECTORS
+ hvc #0
+ ldr lr, [sp], #16
+ ret
ENDPROC(__hyp_set_vectors)
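
Editor's note: the rewritten el1_sync above turns the hyp-stub into a small HVC command ABI: x0 selects the operation (HVC_GET_VECTORS or HVC_SET_VECTORS), x1 carries the new vector base for the set case, and any result comes back in x0. A hedged C-level sketch of a wrapper for that calling convention (illustrative only; the real callers are the assembly routines above):

/* Sketch only: issue an HVC with the command in x0, payload in x1. */
static inline unsigned long hyp_stub_hvc(unsigned long cmd, unsigned long arg)
{
	register unsigned long x0 asm("x0") = cmd;
	register unsigned long x1 asm("x1") = arg;

	/* For HVC_GET_VECTORS the stub hands back vbar_el2 in x0. */
	asm volatile("hvc #0" : "+r" (x0) : "r" (x1) : "memory");
	return x0;
}
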
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index bc2abb8b1599..f0be31f1dd45 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -26,31 +26,40 @@
* There aren't any ELF relocations we can use to endian-swap values known only
* at link time (e.g. the subtraction of two symbol addresses), so we must get
* the linker to endian-swap certain values before emitting them.
+ *
+ * Note that, in order for this to work when building the ELF64 PIE executable
+ * (for KASLR), these values should not be referenced via R_AARCH64_ABS64
+ * relocations, since these are fixed up at runtime rather than at build time
+ * when PIE is in effect. So we need to split them up in 32-bit high and low
+ * words.
*/
#ifdef CONFIG_CPU_BIG_ENDIAN
-#define DATA_LE64(data) \
- ((((data) & 0x00000000000000ff) << 56) | \
- (((data) & 0x000000000000ff00) << 40) | \
- (((data) & 0x0000000000ff0000) << 24) | \
- (((data) & 0x00000000ff000000) << 8) | \
- (((data) & 0x000000ff00000000) >> 8) | \
- (((data) & 0x0000ff0000000000) >> 24) | \
- (((data) & 0x00ff000000000000) >> 40) | \
- (((data) & 0xff00000000000000) >> 56))
+#define DATA_LE32(data) \
+ ((((data) & 0x000000ff) << 24) | \
+ (((data) & 0x0000ff00) << 8) | \
+ (((data) & 0x00ff0000) >> 8) | \
+ (((data) & 0xff000000) >> 24))
#else
-#define DATA_LE64(data) ((data) & 0xffffffffffffffff)
+#define DATA_LE32(data) ((data) & 0xffffffff)
#endif
+#define DEFINE_IMAGE_LE64(sym, data) \
+ sym##_lo32 = DATA_LE32((data) & 0xffffffff); \
+ sym##_hi32 = DATA_LE32((data) >> 32)
+
#ifdef CONFIG_CPU_BIG_ENDIAN
-#define __HEAD_FLAG_BE 1
+#define __HEAD_FLAG_BE 1
#else
-#define __HEAD_FLAG_BE 0
+#define __HEAD_FLAG_BE 0
#endif
-#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2)
+#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2)
-#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \
- (__HEAD_FLAG_PAGE_SIZE << 1))
+#define __HEAD_FLAG_PHYS_BASE 1
+
+#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \
+ (__HEAD_FLAG_PAGE_SIZE << 1) | \
+ (__HEAD_FLAG_PHYS_BASE << 3))
/*
* These will output as part of the Image header, which should be little-endian
@@ -58,12 +67,24 @@
* endian swapped in head.S, all are done here for consistency.
*/
#define HEAD_SYMBOLS \
- _kernel_size_le = DATA_LE64(_end - _text); \
- _kernel_offset_le = DATA_LE64(TEXT_OFFSET); \
- _kernel_flags_le = DATA_LE64(__HEAD_FLAGS);
+ DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \
+ DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \
+ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS);
#ifdef CONFIG_EFI
+__efistub_stext_offset = stext - _text;
+
+/*
+ * Prevent the symbol aliases below from being emitted into the kallsyms
+ * table, by forcing them to be absolute symbols (which are conveniently
+ * ignored by scripts/kallsyms) rather than section relative symbols.
+ * The distinction is only relevant for partial linking, and only for symbols
+ * that are defined within a section declaration (which is not the case for
+ * the definitions below) so the resulting values will be identical.
+ */
+#define KALLSYMS_HIDE(sym) ABSOLUTE(sym)
+
/*
* The EFI stub has its own symbol namespace prefixed by __efistub_, to
* isolate it from the kernel proper. The following symbols are legally
@@ -73,25 +94,25 @@
* linked at. The routines below are all implemented in assembler in a
* position independent manner
*/
-__efistub_memcmp = __pi_memcmp;
-__efistub_memchr = __pi_memchr;
-__efistub_memcpy = __pi_memcpy;
-__efistub_memmove = __pi_memmove;
-__efistub_memset = __pi_memset;
-__efistub_strlen = __pi_strlen;
-__efistub_strcmp = __pi_strcmp;
-__efistub_strncmp = __pi_strncmp;
-__efistub___flush_dcache_area = __pi___flush_dcache_area;
+__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp);
+__efistub_memchr = KALLSYMS_HIDE(__pi_memchr);
+__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy);
+__efistub_memmove = KALLSYMS_HIDE(__pi_memmove);
+__efistub_memset = KALLSYMS_HIDE(__pi_memset);
+__efistub_strlen = KALLSYMS_HIDE(__pi_strlen);
+__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp);
+__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area);
#ifdef CONFIG_KASAN
-__efistub___memcpy = __pi_memcpy;
-__efistub___memmove = __pi_memmove;
-__efistub___memset = __pi_memset;
+__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
+__efistub___memmove = KALLSYMS_HIDE(__pi_memmove);
+__efistub___memset = KALLSYMS_HIDE(__pi_memset);
#endif
-__efistub__text = _text;
-__efistub__end = _end;
-__efistub__edata = _edata;
+__efistub__text = KALLSYMS_HIDE(_text);
+__efistub__end = KALLSYMS_HIDE(_end);
+__efistub__edata = KALLSYMS_HIDE(_edata);
#endif
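
Editor's note: DEFINE_IMAGE_LE64() splits each 64-bit header value into two 32-bit halves precisely so that no R_AARCH64_ABS64 relocation is emitted for them in the PIE build. A small self-contained check of the big-endian DATA_LE32() byte shuffle (hypothetical test harness, not kernel code):

#include <assert.h>
#include <stdint.h>

/* Same shuffle as the CONFIG_CPU_BIG_ENDIAN DATA_LE32() above. */
static uint32_t data_le32(uint32_t data)
{
	return ((data & 0x000000ff) << 24) |
	       ((data & 0x0000ff00) <<  8) |
	       ((data & 0x00ff0000) >>  8) |
	       ((data & 0xff000000) >> 24);
}

int main(void)
{
	/* A value stored byte-reversed on a BE build reads back LE. */
	assert(data_le32(0x11223344) == 0x44332211);
	return 0;
}
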
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index c08b9ad6f429..750f422f3f2c 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -30,6 +30,7 @@
#include <asm/cacheflush.h>
#include <asm/debug-monitors.h>
#include <asm/fixmap.h>
+#include <asm/opcodes.h>
#include <asm/insn.h>
#define AARCH64_INSN_SF_BIT BIT(31)
@@ -162,6 +163,32 @@ static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn)
aarch64_insn_is_nop(insn);
}
+bool __kprobes aarch64_insn_uses_literal(u32 insn)
+{
+	/* ldr/ldrsw (literal), adr/adrp, prfm (literal) */
+
+ return aarch64_insn_is_ldr_lit(insn) ||
+ aarch64_insn_is_ldrsw_lit(insn) ||
+ aarch64_insn_is_adr_adrp(insn) ||
+ aarch64_insn_is_prfm_lit(insn);
+}
+
+bool __kprobes aarch64_insn_is_branch(u32 insn)
+{
+	/* b, bl, cb*, tb*, b.cond, br, blr, ret */
+
+ return aarch64_insn_is_b(insn) ||
+ aarch64_insn_is_bl(insn) ||
+ aarch64_insn_is_cbz(insn) ||
+ aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_tbz(insn) ||
+ aarch64_insn_is_tbnz(insn) ||
+ aarch64_insn_is_ret(insn) ||
+ aarch64_insn_is_br(insn) ||
+ aarch64_insn_is_blr(insn) ||
+ aarch64_insn_is_bcond(insn);
+}
+
/*
* ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a
* Section B2.6.5 "Concurrent modification and execution of instructions":
@@ -1116,6 +1143,14 @@ u32 aarch64_set_branch_offset(u32 insn, s32 offset)
BUG();
}
+/*
+ * Extract the Op/CR data from an msr/mrs instruction.
+ */
+u32 aarch64_insn_extract_system_reg(u32 insn)
+{
+ return (insn & 0x1FFFE0) >> 5;
+}
+
bool aarch32_insn_is_wide(u32 insn)
{
return insn >= 0xe800;
@@ -1141,3 +1176,101 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn)
{
return insn & CRM_MASK;
}
+
+static bool __kprobes __check_eq(unsigned long pstate)
+{
+ return (pstate & PSR_Z_BIT) != 0;
+}
+
+static bool __kprobes __check_ne(unsigned long pstate)
+{
+ return (pstate & PSR_Z_BIT) == 0;
+}
+
+static bool __kprobes __check_cs(unsigned long pstate)
+{
+ return (pstate & PSR_C_BIT) != 0;
+}
+
+static bool __kprobes __check_cc(unsigned long pstate)
+{
+ return (pstate & PSR_C_BIT) == 0;
+}
+
+static bool __kprobes __check_mi(unsigned long pstate)
+{
+ return (pstate & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_pl(unsigned long pstate)
+{
+ return (pstate & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_vs(unsigned long pstate)
+{
+ return (pstate & PSR_V_BIT) != 0;
+}
+
+static bool __kprobes __check_vc(unsigned long pstate)
+{
+ return (pstate & PSR_V_BIT) == 0;
+}
+
+static bool __kprobes __check_hi(unsigned long pstate)
+{
+ pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */
+ return (pstate & PSR_C_BIT) != 0;
+}
+
+static bool __kprobes __check_ls(unsigned long pstate)
+{
+ pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */
+ return (pstate & PSR_C_BIT) == 0;
+}
+
+static bool __kprobes __check_ge(unsigned long pstate)
+{
+ pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */
+ return (pstate & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_lt(unsigned long pstate)
+{
+ pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */
+ return (pstate & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_gt(unsigned long pstate)
+{
+ /*PSR_N_BIT ^= PSR_V_BIT */
+ unsigned long temp = pstate ^ (pstate << 3);
+
+ temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */
+ return (temp & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_le(unsigned long pstate)
+{
+ /*PSR_N_BIT ^= PSR_V_BIT */
+ unsigned long temp = pstate ^ (pstate << 3);
+
+ temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */
+ return (temp & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_al(unsigned long pstate)
+{
+ return true;
+}
+
+/*
+ * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that
+ * it behaves identically to 0b1110 ("al").
+ */
+pstate_check_t * const aarch32_opcode_cond_checks[16] = {
+ __check_eq, __check_ne, __check_cs, __check_cc,
+ __check_mi, __check_pl, __check_vs, __check_vc,
+ __check_hi, __check_ls, __check_ge, __check_lt,
+ __check_gt, __check_le, __check_al, __check_al
+};
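
Editor's note: the __check_*() helpers above avoid extracting individual flags by sliding one flag into another's bit position (PSTATE layout: N=31, Z=30, C=29, V=28). A hedged standalone verification of the two shift tricks:

#include <assert.h>

#define PSR_V_BIT	0x10000000UL
#define PSR_C_BIT	0x20000000UL
#define PSR_Z_BIT	0x40000000UL
#define PSR_N_BIT	0x80000000UL

int main(void)
{
	unsigned long pstate;

	/* "hi" is C && !Z: shifting Z (bit 30) right by one lines it up
	 * with C (bit 29), so ~(pstate >> 1) masks C off whenever Z is
	 * set. With C and Z both set, C must come out clear. */
	pstate = PSR_C_BIT | PSR_Z_BIT;
	pstate &= ~(pstate >> 1);
	assert(!(pstate & PSR_C_BIT));

	/* "ge" is N == V: shifting V (bit 28) left by three lines it up
	 * with N (bit 31), so the XOR leaves N^V in the N position.
	 * With N and V both set, the result must have N clear. */
	pstate = PSR_N_BIT | PSR_V_BIT;
	pstate ^= (pstate << 3);
	assert(!(pstate & PSR_N_BIT));

	return 0;
}
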
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 9f17ec071ee0..2386b26c0712 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -30,6 +30,9 @@
unsigned long irq_err_count;
+/* irq stack only needs to be 16-byte aligned - not IRQ_STACK_SIZE aligned. */
+DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16);
+
int arch_show_interrupts(struct seq_file *p, int prec)
{
show_ipi_list(p, prec);
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
new file mode 100644
index 000000000000..b05469173ba5
--- /dev/null
+++ b/arch/arm64/kernel/kaslr.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/libfdt.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include <asm/fixmap.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/memory.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+
+u64 __read_mostly module_alloc_base;
+u16 __initdata memstart_offset_seed;
+
+static __init u64 get_kaslr_seed(void *fdt)
+{
+ int node, len;
+ u64 *prop;
+ u64 ret;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ return 0;
+
+ prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len);
+ if (!prop || len != sizeof(u64))
+ return 0;
+
+ ret = fdt64_to_cpu(*prop);
+ *prop = 0;
+ return ret;
+}
+
+static __init const u8 *get_cmdline(void *fdt)
+{
+ static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE;
+
+ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) {
+ int node;
+ const u8 *prop;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ goto out;
+
+ prop = fdt_getprop(fdt, node, "bootargs", NULL);
+ if (!prop)
+ goto out;
+ return prop;
+ }
+out:
+ return default_cmdline;
+}
+
+extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size,
+ pgprot_t prot);
+
+/*
+ * This routine will be executed with the kernel mapped at its default virtual
+ * address, and if it returns successfully, the kernel will be remapped, and
+ * start_kernel() will be executed from a randomized virtual offset. The
+ * relocation will cause all absolute references (e.g., static variables
+ * containing function pointers) to be reinitialized, and zero-initialized
+ * .bss variables will be reset to 0.
+ */
+u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
+{
+ void *fdt;
+ u64 seed, offset, mask, module_range;
+ const u8 *cmdline, *str;
+ int size;
+
+ /*
+ * Set a reasonable default for module_alloc_base in case
+ * we end up running with module randomization disabled.
+ */
+ module_alloc_base = (u64)_etext - MODULES_VSIZE;
+
+ /*
+ * Try to map the FDT early. If this fails, we simply bail,
+ * and proceed with KASLR disabled. We will make another
+	 * attempt at mapping the FDT in setup_machine().
+ */
+ early_fixmap_init();
+ fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL);
+ if (!fdt)
+ return 0;
+
+ /*
+ * Retrieve (and wipe) the seed from the FDT
+ */
+ seed = get_kaslr_seed(fdt);
+ if (!seed)
+ return 0;
+
+ /*
+ * Check if 'nokaslr' appears on the command line, and
+ * return 0 if that is the case.
+ */
+ cmdline = get_cmdline(fdt);
+ str = strstr(cmdline, "nokaslr");
+ if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
+ return 0;
+
+ /*
+ * OK, so we are proceeding with KASLR enabled. Calculate a suitable
+ * kernel image offset from the seed. Let's place the kernel in the
+ * lower half of the VMALLOC area (VA_BITS - 2).
+ * Even if we could randomize at page granularity for 16k and 64k pages,
+ * let's always round to 2 MB so we don't interfere with the ability to
+	 * map using contiguous PTEs.
+ */
+ mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1);
+ offset = seed & mask;
+
+ /* use the top 16 bits to randomize the linear region */
+ memstart_offset_seed = seed >> 48;
+
+ /*
+ * The kernel Image should not extend across a 1GB/32MB/512MB alignment
+ * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this
+ * happens, increase the KASLR offset by the size of the kernel image.
+ */
+ if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) !=
+ (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT))
+ offset = (offset + (u64)(_end - _text)) & mask;
+
+ if (IS_ENABLED(CONFIG_KASAN))
+ /*
+ * KASAN does not expect the module region to intersect the
+ * vmalloc region, since shadow memory is allocated for each
+ * module at load time, whereas the vmalloc region is shadowed
+ * by KASAN zero pages. So keep modules out of the vmalloc
+ * region if KASAN is enabled.
+ */
+ return offset;
+
+ if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
+ /*
+ * Randomize the module region independently from the core
+ * kernel. This prevents modules from leaking any information
+ * about the address of the kernel itself, but results in
+ * branches between modules and the core kernel that are
+ * resolved via PLTs. (Branches between modules will be
+ * resolved normally.)
+ */
+ module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE;
+ module_alloc_base = VMALLOC_START;
+ } else {
+ /*
+ * Randomize the module region by setting module_alloc_base to
+ * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
+		 * _stext). This guarantees that the resulting region still
+ * covers [_stext, _etext], and that all relative branches can
+ * be resolved without veneers.
+ */
+ module_range = MODULES_VSIZE - (u64)(_etext - _stext);
+ module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
+ }
+
+ /* use the lower 21 bits to randomize the base of the module region */
+ module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21;
+ module_alloc_base &= PAGE_MASK;
+
+ return offset;
+}
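
Editor's note: kaslr_early_init() above reduces the seed to a 2 MB aligned offset below 1 << (VA_BITS - 2). A hedged standalone check of that arithmetic (VA_BITS = 48 and the seed value are assumptions for the example):

#include <assert.h>
#include <stdio.h>

#define SZ_2M	0x200000UL
#define VA_BITS	48	/* assumption: 48-bit VA configuration */

int main(void)
{
	unsigned long seed = 0x123456789abcdef0UL;	/* arbitrary */
	unsigned long mask, offset;

	/* Same arithmetic as kaslr_early_init(): keep the image offset
	 * below 1 << (VA_BITS - 2) and round it to 2 MB so contiguous
	 * PTE mappings stay usable. */
	mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1);
	offset = seed & mask;

	assert((offset & (SZ_2M - 1)) == 0);		/* 2 MB aligned */
	assert(offset < (1UL << (VA_BITS - 2)));	/* within range */
	printf("offset = %#lx\n", offset);
	return 0;
}
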
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
index bcac81e600b9..814d0c51b2f9 100644
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -22,6 +22,7 @@
#include <linux/irq.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
+#include <linux/kprobes.h>
#include <asm/traps.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
@@ -218,6 +219,7 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr)
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
}
+NOKPROBE_SYMBOL(kgdb_brk_fn)
static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr)
{
@@ -226,12 +228,14 @@ static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr)
return 0;
}
+NOKPROBE_SYMBOL(kgdb_compiled_brk_fn);
static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr)
{
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
}
+NOKPROBE_SYMBOL(kgdb_step_brk_fn);
static struct break_hook kgdb_brkpt_hook = {
.esr_mask = 0xffffffff,
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
new file mode 100644
index 000000000000..1ce90d8450ae
--- /dev/null
+++ b/arch/arm64/kernel/module-plts.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2014-2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/elf.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+
+struct plt_entry {
+ /*
+ * A program that conforms to the AArch64 Procedure Call Standard
+ * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or
+ * IP1 (x17) may be inserted at any branch instruction that is
+ * exposed to a relocation that supports long branches. Since that
+ * is exactly what we are dealing with here, we are free to use x16
+ * as a scratch register in the PLT veneers.
+ */
+ __le32 mov0; /* movn x16, #0x.... */
+ __le32 mov1; /* movk x16, #0x...., lsl #16 */
+ __le32 mov2; /* movk x16, #0x...., lsl #32 */
+ __le32 br; /* br x16 */
+};
+
+u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela,
+ Elf64_Sym *sym)
+{
+ struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr;
+ int i = mod->arch.plt_num_entries;
+ u64 val = sym->st_value + rela->r_addend;
+
+ /*
+ * We only emit PLT entries against undefined (SHN_UNDEF) symbols,
+ * which are listed in the ELF symtab section, but without a type
+ * or a size.
+ * So, similar to how the module loader uses the Elf64_Sym::st_value
+ * field to store the resolved addresses of undefined symbols, let's
+ * borrow the Elf64_Sym::st_size field (whose value is never used by
+ * the module loader, even for symbols that are defined) to record
+ * the address of a symbol's associated PLT entry as we emit it for a
+ * zero addend relocation (which is the only kind we have to deal with
+ * in practice). This allows us to find duplicates without having to
+ * go through the table every time.
+ */
+ if (rela->r_addend == 0 && sym->st_size != 0) {
+ BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]);
+ return sym->st_size;
+ }
+
+ mod->arch.plt_num_entries++;
+ BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries);
+
+ /*
+ * MOVK/MOVN/MOVZ opcode:
+ * +--------+------------+--------+-----------+-------------+---------+
+ * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] |
+ * +--------+------------+--------+-----------+-------------+---------+
+ *
+ * Rd := 0x10 (x16)
+ * hw := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32)
+ * opc := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ)
+ * sf := 1 (64-bit variant)
+ */
+ plt[i] = (struct plt_entry){
+ cpu_to_le32(0x92800010 | (((~val ) & 0xffff)) << 5),
+ cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5),
+ cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5),
+ cpu_to_le32(0xd61f0200)
+ };
+
+ if (rela->r_addend == 0)
+ sym->st_size = (u64)&plt[i];
+
+ return (u64)&plt[i];
+}
+
+#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
+{
+ const Elf64_Rela *x = a, *y = b;
+ int i;
+
+ /* sort by type, symbol index and addend */
+ i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info));
+ if (i == 0)
+ i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info));
+ if (i == 0)
+ i = cmp_3way(x->r_addend, y->r_addend);
+ return i;
+}
+
+static bool duplicate_rel(const Elf64_Rela *rela, int num)
+{
+ /*
+ * Entries are sorted by type, symbol index and addend. That means
+ * that, if a duplicate entry exists, it must be in the preceding
+ * slot.
+ */
+ return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0;
+}
+
+static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num)
+{
+ unsigned int ret = 0;
+ Elf64_Sym *s;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ switch (ELF64_R_TYPE(rela[i].r_info)) {
+ case R_AARCH64_JUMP26:
+ case R_AARCH64_CALL26:
+ /*
+			 * to undefined symbols. This is not simply a heuristic;
+ * to undefined symbols. This is not simply a heuristic,
+ * it is a fundamental limitation, since the PLT itself
+ * is part of the module, and needs to be within 128 MB
+ * as well, so modules can never grow beyond that limit.
+ */
+ s = syms + ELF64_R_SYM(rela[i].r_info);
+ if (s->st_shndx != SHN_UNDEF)
+ break;
+
+ /*
+ * Jump relocations with non-zero addends against
+ * undefined symbols are supported by the ELF spec, but
+ * do not occur in practice (e.g., 'jump n bytes past
+ * the entry point of undefined function symbol f').
+ * So we need to support them, but there is no need to
+ * take them into consideration when trying to optimize
+ * this code. So let's only check for duplicates when
+ * the addend is zero: this allows us to record the PLT
+ * entry address in the symbol table itself, rather than
+ * having to search the list for duplicates each time we
+ * emit one.
+ */
+ if (rela[i].r_addend != 0 || !duplicate_rel(rela, i))
+ ret++;
+ break;
+ }
+ }
+ return ret;
+}
+
+int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
+{
+ unsigned long plt_max_entries = 0;
+ Elf64_Sym *syms = NULL;
+ int i;
+
+ /*
+ * Find the empty .plt section so we can expand it to store the PLT
+ * entries. Record the symtab address as well.
+ */
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0)
+ mod->arch.plt = sechdrs + i;
+ else if (sechdrs[i].sh_type == SHT_SYMTAB)
+ syms = (Elf64_Sym *)sechdrs[i].sh_addr;
+ }
+
+ if (!mod->arch.plt) {
+ pr_err("%s: module PLT section missing\n", mod->name);
+ return -ENOEXEC;
+ }
+ if (!syms) {
+ pr_err("%s: module symtab section missing\n", mod->name);
+ return -ENOEXEC;
+ }
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset;
+ int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
+ Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info;
+
+ if (sechdrs[i].sh_type != SHT_RELA)
+ continue;
+
+ /* ignore relocations that operate on non-exec sections */
+ if (!(dstsec->sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* sort by type, symbol index and addend */
+ sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL);
+
+ plt_max_entries += count_plts(syms, rels, numrels);
+ }
+
+ mod->arch.plt->sh_type = SHT_NOBITS;
+ mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+ mod->arch.plt->sh_addralign = L1_CACHE_BYTES;
+ mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry);
+ mod->arch.plt_num_entries = 0;
+ mod->arch.plt_max_entries = plt_max_entries;
+ return 0;
+}
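
Editor's note: the MOVN/MOVK triple emitted by module_emit_plt_entry() can materialize any kernel or module address in only three instructions because MOVN seeds x16 with the low 16 bits of the target and all ones above, and bits [63:48] of such addresses are already all ones. A hedged standalone check of the encoding arithmetic (the sample address is an assumption; field extraction follows the imm16[20:5] layout in the comment above):

#include <assert.h>
#include <stdint.h>

/* Same encodings as module_emit_plt_entry() above. */
static uint32_t movn(uint64_t v)   { return 0x92800010u | (uint32_t)(~v & 0xffff) << 5; }
static uint32_t movk16(uint64_t v) { return 0xf2a00010u | (uint32_t)((v >> 16) & 0xffff) << 5; }
static uint32_t movk32(uint64_t v) { return 0xf2c00010u | (uint32_t)((v >> 32) & 0xffff) << 5; }

int main(void)
{
	/* assumption: a typical kernel VA with bits [63:48] all ones */
	uint64_t val = 0xffff000012345678ULL;
	uint64_t x16;

	/* MOVN x16, #imm16 yields ~imm16: the low 16 bits of val with
	 * everything above set to one. The two MOVKs then patch in bits
	 * [31:16] and [47:32], leaving [63:48] = 0xffff untouched. */
	x16 = ~(uint64_t)((movn(val) >> 5) & 0xffff);
	x16 = (x16 & ~0xffff0000ULL) |
	      ((uint64_t)((movk16(val) >> 5) & 0xffff) << 16);
	x16 = (x16 & ~0xffff00000000ULL) |
	      ((uint64_t)((movk32(val) >> 5) & 0xffff) << 32);

	assert(x16 == val);	/* the veneer lands on the target */
	return 0;
}
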
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index f4bc779e62e8..7f316982ce00 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -30,17 +30,30 @@
#include <asm/insn.h>
#include <asm/sections.h>
-#define AARCH64_INSN_IMM_MOVNZ AARCH64_INSN_IMM_MAX
-#define AARCH64_INSN_IMM_MOVK AARCH64_INSN_IMM_16
-
void *module_alloc(unsigned long size)
{
void *p;
- p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
+ p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+ module_alloc_base + MODULES_VSIZE,
GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
NUMA_NO_NODE, __builtin_return_address(0));
+ if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+ !IS_ENABLED(CONFIG_KASAN))
+ /*
+ * KASAN can only deal with module allocations being served
+ * from the reserved module region, since the remainder of
+ * the vmalloc region is already backed by zero shadow pages,
+ * and punching holes into it is non-trivial. Since the module
+ * region is not randomized when KASAN is enabled, it is even
+ * less likely that the module region gets exhausted, so we
+ * can simply omit this fallback in that case.
+ */
+ p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START,
+ VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
if (p && (kasan_module_alloc(p, size) < 0)) {
vfree(p);
return NULL;
@@ -75,15 +88,18 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val)
static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
{
- u64 imm_mask = (1 << len) - 1;
s64 sval = do_reloc(op, place, val);
switch (len) {
case 16:
*(s16 *)place = sval;
+ if (sval < S16_MIN || sval > U16_MAX)
+ return -ERANGE;
break;
case 32:
*(s32 *)place = sval;
+ if (sval < S32_MIN || sval > U32_MAX)
+ return -ERANGE;
break;
case 64:
*(s64 *)place = sval;
@@ -92,34 +108,23 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
pr_err("Invalid length (%d) for data relocation\n", len);
return 0;
}
-
- /*
- * Extract the upper value bits (including the sign bit) and
- * shift them to bit 0.
- */
- sval = (s64)(sval & ~(imm_mask >> 1)) >> (len - 1);
-
- /*
- * Overflow has occurred if the value is not representable in
- * len bits (i.e the bottom len bits are not sign-extended and
- * the top bits are not all zero).
- */
- if ((u64)(sval + 1) > 2)
- return -ERANGE;
-
return 0;
}
+enum aarch64_insn_movw_imm_type {
+ AARCH64_INSN_IMM_MOVNZ,
+ AARCH64_INSN_IMM_MOVKZ,
+};
+
static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
- int lsb, enum aarch64_insn_imm_type imm_type)
+ int lsb, enum aarch64_insn_movw_imm_type imm_type)
{
- u64 imm, limit = 0;
+ u64 imm;
s64 sval;
u32 insn = le32_to_cpu(*(u32 *)place);
sval = do_reloc(op, place, val);
- sval >>= lsb;
- imm = sval & 0xffff;
+ imm = sval >> lsb;
if (imm_type == AARCH64_INSN_IMM_MOVNZ) {
/*
@@ -128,7 +133,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
* immediate is less than zero.
*/
insn &= ~(3 << 29);
- if ((s64)imm >= 0) {
+ if (sval >= 0) {
/* >=0: Set the instruction to MOVZ (opcode 10b). */
insn |= 2 << 29;
} else {
@@ -140,29 +145,13 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
*/
imm = ~imm;
}
- imm_type = AARCH64_INSN_IMM_MOVK;
}
/* Update the instruction with the new encoding. */
- insn = aarch64_insn_encode_immediate(imm_type, insn, imm);
+ insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
*(u32 *)place = cpu_to_le32(insn);
- /* Shift out the immediate field. */
- sval >>= 16;
-
- /*
- * For unsigned immediates, the overflow check is straightforward.
- * For signed immediates, the sign bit is actually the bit past the
- * most significant bit of the field.
- * The AARCH64_INSN_IMM_16 immediate type is unsigned.
- */
- if (imm_type != AARCH64_INSN_IMM_16) {
- sval++;
- limit++;
- }
-
- /* Check the upper bits depending on the sign of the immediate. */
- if ((u64)sval > limit)
+ if (imm > U16_MAX)
return -ERANGE;
return 0;
@@ -267,25 +256,25 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
overflow_check = false;
case R_AARCH64_MOVW_UABS_G0:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0,
- AARCH64_INSN_IMM_16);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_UABS_G1_NC:
overflow_check = false;
case R_AARCH64_MOVW_UABS_G1:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16,
- AARCH64_INSN_IMM_16);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_UABS_G2_NC:
overflow_check = false;
case R_AARCH64_MOVW_UABS_G2:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32,
- AARCH64_INSN_IMM_16);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_UABS_G3:
/* We're using the top bits so we can't overflow. */
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48,
- AARCH64_INSN_IMM_16);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_SABS_G0:
ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0,
@@ -302,7 +291,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_AARCH64_MOVW_PREL_G0_NC:
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0,
- AARCH64_INSN_IMM_MOVK);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_PREL_G0:
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0,
@@ -311,7 +300,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_AARCH64_MOVW_PREL_G1_NC:
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16,
- AARCH64_INSN_IMM_MOVK);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_PREL_G1:
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16,
@@ -320,7 +309,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_AARCH64_MOVW_PREL_G2_NC:
overflow_check = false;
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32,
- AARCH64_INSN_IMM_MOVK);
+ AARCH64_INSN_IMM_MOVKZ);
break;
case R_AARCH64_MOVW_PREL_G2:
ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32,
@@ -388,6 +377,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_AARCH64_CALL26:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26,
AARCH64_INSN_IMM_26);
+
+ if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+ ovf == -ERANGE) {
+ val = module_emit_plt_entry(me, &rel[i], sym);
+ ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2,
+ 26, AARCH64_INSN_IMM_26);
+ }
break;
default:
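
Editor's note: the rewritten reloc_data() above swaps the old sign-extension overflow dance for a direct range test: a 16-bit (or 32-bit) field is accepted when the value is representable as either a signed or an unsigned quantity of that width. A tiny hedged check of the 16-bit acceptance window:

#include <assert.h>
#include <stdint.h>

/* Same acceptance window as the 16-bit case of reloc_data() above:
 * the value fits if it is representable as either s16 or u16. */
static int fits16(int64_t sval)
{
	return sval >= INT16_MIN && sval <= UINT16_MAX;
}

int main(void)
{
	assert(fits16(-32768));		/* S16_MIN, valid as s16 */
	assert(fits16(65535));		/* U16_MAX, valid as u16 */
	assert(!fits16(-32769));	/* below S16_MIN */
	assert(!fits16(65536));		/* above U16_MAX */
	return 0;
}
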
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds
new file mode 100644
index 000000000000..8949f6c6f729
--- /dev/null
+++ b/arch/arm64/kernel/module.lds
@@ -0,0 +1,3 @@
+SECTIONS {
+ .plt (NOLOAD) : { BYTE(0) }
+}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 3aa74830cc69..ff4665462a02 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -164,8 +164,11 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
frame.fp = regs->regs[29];
frame.sp = regs->sp;
frame.pc = regs->pc;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ frame.graph = current->curr_ret_stack;
+#endif
- walk_stackframe(&frame, callchain_trace, entry);
+ walk_stackframe(current, &frame, callchain_trace, entry);
}
unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile
new file mode 100644
index 000000000000..ce06312e3d34
--- /dev/null
+++ b/arch/arm64/kernel/probes/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o \
+ kprobes_trampoline.o \
+ simulate-insn.o
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
new file mode 100644
index 000000000000..f7931d900bca
--- /dev/null
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -0,0 +1,174 @@
+/*
+ * arch/arm64/kernel/probes/decode-insn.c
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <asm/kprobes.h>
+#include <asm/insn.h>
+#include <asm/sections.h>
+
+#include "decode-insn.h"
+#include "simulate-insn.h"
+
+static bool __kprobes aarch64_insn_is_steppable(u32 insn)
+{
+ /*
+ * Branch instructions will write a new value into the PC which is
+ * likely to be relative to the XOL address and therefore invalid.
+ * Deliberate generation of an exception during stepping is also not
+ * currently safe. Lastly, MSR instructions can do any number of nasty
+ * things we can't handle during single-stepping.
+ */
+ if (aarch64_get_insn_class(insn) == AARCH64_INSN_CLS_BR_SYS) {
+ if (aarch64_insn_is_branch(insn) ||
+ aarch64_insn_is_msr_imm(insn) ||
+ aarch64_insn_is_msr_reg(insn) ||
+ aarch64_insn_is_exception(insn) ||
+ aarch64_insn_is_eret(insn))
+ return false;
+
+ /*
+ * The MRS instruction may not return a correct value when
+ * executing in the single-stepping environment. We do make one
+ * exception, for reading the DAIF bits.
+ */
+ if (aarch64_insn_is_mrs(insn))
+ return aarch64_insn_extract_system_reg(insn)
+ != AARCH64_INSN_SPCLREG_DAIF;
+
+ /*
+		 * The HINT instruction is problematic when single-stepping,
+ * except for the NOP case.
+ */
+ if (aarch64_insn_is_hint(insn))
+ return aarch64_insn_is_nop(insn);
+
+ return true;
+ }
+
+ /*
+ * Instructions which load PC relative literals are not going to work
+ * when executed from an XOL slot. Instructions doing an exclusive
+ * load/store are not going to complete successfully when single-step
+ * exception handling happens in the middle of the sequence.
+ */
+ if (aarch64_insn_uses_literal(insn) ||
+ aarch64_insn_is_exclusive(insn))
+ return false;
+
+ return true;
+}
+
+/* Return:
+ * INSN_REJECTED     If the instruction is not allowed to be kprobed,
+ * INSN_GOOD         If the instruction is supported and uses an instruction slot,
+ * INSN_GOOD_NO_SLOT If the instruction is supported but doesn't use its slot.
+ */
+static enum kprobe_insn __kprobes
+arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi)
+{
+ /*
+	 * Instructions reading or modifying the PC won't work from the XOL
+	 * slot; only steppable instructions are given one.
+ */
+ if (aarch64_insn_is_steppable(insn))
+ return INSN_GOOD;
+
+ if (aarch64_insn_is_bcond(insn)) {
+ asi->handler = simulate_b_cond;
+ } else if (aarch64_insn_is_cbz(insn) ||
+ aarch64_insn_is_cbnz(insn)) {
+ asi->handler = simulate_cbz_cbnz;
+ } else if (aarch64_insn_is_tbz(insn) ||
+ aarch64_insn_is_tbnz(insn)) {
+ asi->handler = simulate_tbz_tbnz;
+ } else if (aarch64_insn_is_adr_adrp(insn)) {
+ asi->handler = simulate_adr_adrp;
+ } else if (aarch64_insn_is_b(insn) ||
+ aarch64_insn_is_bl(insn)) {
+ asi->handler = simulate_b_bl;
+ } else if (aarch64_insn_is_br(insn) ||
+ aarch64_insn_is_blr(insn) ||
+ aarch64_insn_is_ret(insn)) {
+ asi->handler = simulate_br_blr_ret;
+ } else if (aarch64_insn_is_ldr_lit(insn)) {
+ asi->handler = simulate_ldr_literal;
+ } else if (aarch64_insn_is_ldrsw_lit(insn)) {
+ asi->handler = simulate_ldrsw_literal;
+ } else {
+ /*
+ * Instruction cannot be stepped out-of-line and we don't
+ * (yet) simulate it.
+ */
+ return INSN_REJECTED;
+ }
+
+ return INSN_GOOD_NO_SLOT;
+}
+
+static bool __kprobes
+is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end)
+{
+ while (scan_start > scan_end) {
+ /*
+		 * An atomic region starts with an exclusive load and ends
+		 * with an exclusive store.
+ */
+ if (aarch64_insn_is_store_ex(le32_to_cpu(*scan_start)))
+ return false;
+ else if (aarch64_insn_is_load_ex(le32_to_cpu(*scan_start)))
+ return true;
+ scan_start--;
+ }
+
+ return false;
+}
+
+enum kprobe_insn __kprobes
+arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi)
+{
+ enum kprobe_insn decoded;
+ kprobe_opcode_t insn = le32_to_cpu(*addr);
+ kprobe_opcode_t *scan_start = addr - 1;
+ kprobe_opcode_t *scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE;
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+ struct module *mod;
+#endif
+
+ if (addr >= (kprobe_opcode_t *)_text &&
+ scan_end < (kprobe_opcode_t *)_text)
+ scan_end = (kprobe_opcode_t *)_text;
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+ else {
+ preempt_disable();
+ mod = __module_address((unsigned long)addr);
+ if (mod && within_module_init((unsigned long)addr, mod) &&
+ !within_module_init((unsigned long)scan_end, mod))
+ scan_end = (kprobe_opcode_t *)mod->module_init;
+ else if (mod && within_module_core((unsigned long)addr, mod) &&
+ !within_module_core((unsigned long)scan_end, mod))
+ scan_end = (kprobe_opcode_t *)mod->module_core;
+ preempt_enable();
+ }
+#endif
+ decoded = arm_probe_decode_insn(insn, asi);
+
+ if (decoded == INSN_REJECTED ||
+ is_probed_address_atomic(scan_start, scan_end))
+ return INSN_REJECTED;
+
+ return decoded;
+}
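
Editor's note: is_probed_address_atomic() above scans backwards from the probe point: meeting a StoreExcl first means any exclusive sequence already closed, while meeting a LoadExcl first means the probe sits inside an open LDXR/STXR region, where the single-step exception would make the store fail forever. A hedged toy model of that scan, with made-up opcode values standing in for the real predicates:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t kprobe_opcode_t;

/* Hypothetical opcode values, purely to exercise the scan logic. */
#define FAKE_LDXR	0xe1u
#define FAKE_STXR	0xe2u
#define FAKE_NOP	0x00u

static bool is_store_ex(kprobe_opcode_t insn) { return insn == FAKE_STXR; }
static bool is_load_ex(kprobe_opcode_t insn)  { return insn == FAKE_LDXR; }

/* Same shape as is_probed_address_atomic(); note scan_end itself is
 * never examined, mirroring the kernel's loop condition. */
static bool in_atomic_region(kprobe_opcode_t *scan_start,
			     kprobe_opcode_t *scan_end)
{
	while (scan_start > scan_end) {
		if (is_store_ex(*scan_start))
			return false;
		if (is_load_ex(*scan_start))
			return true;
		scan_start--;
	}
	return false;
}

int main(void)
{
	kprobe_opcode_t text[] = {
		FAKE_NOP, FAKE_LDXR, FAKE_NOP, FAKE_NOP, FAKE_STXR, FAKE_NOP
	};

	assert(in_atomic_region(&text[3], &text[0]));	/* inside ldxr..stxr */
	assert(!in_atomic_region(&text[5], &text[0]));	/* after the stxr */
	return 0;
}
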
diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h
new file mode 100644
index 000000000000..d438289646a6
--- /dev/null
+++ b/arch/arm64/kernel/probes/decode-insn.h
@@ -0,0 +1,35 @@
+/*
+ * arch/arm64/kernel/probes/decode-insn.h
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _ARM_KERNEL_KPROBES_ARM64_H
+#define _ARM_KERNEL_KPROBES_ARM64_H
+
+/*
+ * ARM strongly recommends a limit of 128 bytes between LoadExcl and
+ * StoreExcl instructions in a single thread of execution. So keep the
+ * max atomic context size at 32 instructions (128 bytes / 4 bytes each).
+ */
+#define MAX_ATOMIC_CONTEXT_SIZE (128 / sizeof(kprobe_opcode_t))
+
+enum kprobe_insn {
+ INSN_REJECTED,
+ INSN_GOOD_NO_SLOT,
+ INSN_GOOD,
+};
+
+enum kprobe_insn __kprobes
+arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi);
+
+#endif /* _ARM_KERNEL_KPROBES_ARM64_H */
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
new file mode 100644
index 000000000000..1ee93c7c5a75
--- /dev/null
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -0,0 +1,657 @@
+/*
+ * arch/arm64/kernel/probes/kprobes.c
+ *
+ * Kprobes support for ARM64
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ * Author: Sandeepa Prabhu <sandeepa.prabhu@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/stringify.h>
+#include <asm/traps.h>
+#include <asm/ptrace.h>
+#include <asm/cacheflush.h>
+#include <asm/debug-monitors.h>
+#include <asm/system_misc.h>
+#include <asm/insn.h>
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+#include <asm-generic/sections.h>
+
+#include "decode-insn.h"
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+static void __kprobes
+post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *);
+
+static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
+{
+ /* prepare insn slot */
+ p->ainsn.insn[0] = cpu_to_le32(p->opcode);
+
+ flush_icache_range((uintptr_t) (p->ainsn.insn),
+ (uintptr_t) (p->ainsn.insn) +
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+
+ /*
+	 * The return address needs restoring after single-stepping
+	 * out-of-line (XOL).
+ */
+ p->ainsn.restore = (unsigned long) p->addr +
+ sizeof(kprobe_opcode_t);
+}
+
+static void __kprobes arch_prepare_simulate(struct kprobe *p)
+{
+	/* This instruction is not executed out-of-line (XOL). No need to adjust the PC. */
+ p->ainsn.restore = 0;
+}
+
+static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (p->ainsn.handler)
+ p->ainsn.handler((u32)p->opcode, (long)p->addr, regs);
+
+ /* single step simulated, now go for post processing */
+ post_kprobe_handler(kcb, regs);
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+ unsigned long probe_addr = (unsigned long)p->addr;
+ extern char __start_rodata[];
+ extern char __end_rodata[];
+
+ if (probe_addr & 0x3)
+ return -EINVAL;
+
+ /* copy instruction */
+ p->opcode = le32_to_cpu(*p->addr);
+
+ if (in_exception_text(probe_addr))
+ return -EINVAL;
+ if (probe_addr >= (unsigned long) __start_rodata &&
+ probe_addr <= (unsigned long) __end_rodata)
+ return -EINVAL;
+
+ /* decode instruction */
+ switch (arm_kprobe_decode_insn(p->addr, &p->ainsn)) {
+ case INSN_REJECTED: /* insn not supported */
+ return -EINVAL;
+
+ case INSN_GOOD_NO_SLOT: /* insn need simulation */
+ p->ainsn.insn = NULL;
+ break;
+
+ case INSN_GOOD: /* instruction uses slot */
+ p->ainsn.insn = get_insn_slot();
+ if (!p->ainsn.insn)
+ return -ENOMEM;
+ break;
+	}
+
+ /* prepare the instruction */
+ if (p->ainsn.insn)
+ arch_prepare_ss_slot(p);
+ else
+ arch_prepare_simulate(p);
+
+ return 0;
+}
+
+static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode)
+{
+ void *addrs[1];
+ u32 insns[1];
+
+ addrs[0] = (void *)addr;
+ insns[0] = (u32)opcode;
+
+ return aarch64_insn_patch_text(addrs, insns, 1);
+}
+
+/* arm kprobe: install breakpoint in text */
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+ patch_text(p->addr, BRK64_OPCODE_KPROBES);
+}
+
+/* disarm kprobe: remove breakpoint from text */
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+ patch_text(p->addr, p->opcode);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+ if (p->ainsn.insn) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p)
+{
+ __this_cpu_write(current_kprobe, p);
+}
+
+/*
+ * The D-flag (Debug mask) is set (masked) upon debug exception entry.
+ * Kprobes needs to clear (unmask) the D-flag ONLY in the case of a
+ * recursive probe, i.e. when a probe is hit from kprobe handler context
+ * while executing the pre/post handlers. In this case we return with
+ * D-flag clear so that single-stepping can be carried-out.
+ *
+ * Leave D-flag set in all other cases.
+ */
+static void __kprobes
+spsr_set_debug_flag(struct pt_regs *regs, int mask)
+{
+ unsigned long spsr = regs->pstate;
+
+ if (mask)
+ spsr |= PSR_D_BIT;
+ else
+ spsr &= ~PSR_D_BIT;
+
+ regs->pstate = spsr;
+}
+
+/*
+ * Interrupts need to be disabled before single-step mode is set, and not
+ * reenabled until after single-step mode ends.
+ * Without disabling interrupts on the local CPU, there is a chance of
+ * an interrupt occurring between the exception return and the start of
+ * the out-of-line single-step, which would result in wrongly
+ * single-stepping into the interrupt handler.
+ */
+static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb,
+ struct pt_regs *regs)
+{
+ kcb->saved_irqflag = regs->pstate;
+ regs->pstate |= PSR_I_BIT;
+}
+
+static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb,
+ struct pt_regs *regs)
+{
+ if (kcb->saved_irqflag & PSR_I_BIT)
+ regs->pstate |= PSR_I_BIT;
+ else
+ regs->pstate &= ~PSR_I_BIT;
+}
+
+static void __kprobes
+set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr)
+{
+ kcb->ss_ctx.ss_pending = true;
+ kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t);
+}
+
+static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb)
+{
+ kcb->ss_ctx.ss_pending = false;
+ kcb->ss_ctx.match_addr = 0;
+}
+
+static void __kprobes setup_singlestep(struct kprobe *p,
+ struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
+{
+ unsigned long slot;
+
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else {
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ }
+
+ if (p->ainsn.insn) {
+ /* prepare for single stepping */
+ slot = (unsigned long)p->ainsn.insn;
+
+ set_ss_context(kcb, slot); /* mark pending ss */
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ spsr_set_debug_flag(regs, 0);
+ else
+ WARN_ON(regs->pstate & PSR_D_BIT);
+
+ /* IRQs and single stepping do not mix well. */
+ kprobes_save_local_irqflag(kcb, regs);
+ kernel_enable_single_step(regs);
+ instruction_pointer_set(regs, slot);
+ } else {
+ /* insn simulation */
+ arch_simulate_insn(p, regs);
+ }
+}
+
+static int __kprobes reenter_kprobe(struct kprobe *p,
+ struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SSDONE:
+ case KPROBE_HIT_ACTIVE:
+ kprobes_inc_nmissed_count(p);
+ setup_singlestep(p, regs, kcb, 1);
+ break;
+ case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
+ pr_warn("Unrecoverable kprobe detected at %p.\n", p->addr);
+ dump_kprobe(p);
+ BUG();
+ break;
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void __kprobes
+post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs)
+{
+ struct kprobe *cur = kprobe_running();
+
+ if (!cur)
+ return;
+
+ /* return addr restore if non-branching insn */
+ if (cur->ainsn.restore != 0)
+ instruction_pointer_set(regs, cur->ainsn.restore);
+
+ /* restore back original saved kprobe variables and continue */
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
+ return;
+ }
+ /* call post handler */
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ if (cur->post_handler) {
+ /* post_handler can hit breakpoint and single step
+ * again, so we enable D-flag for recursive exception.
+ */
+ cur->post_handler(cur, regs, 0);
+ }
+
+ reset_current_kprobe();
+}
+
+int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
+ /*
+		 * We are here because the instruction being single-stepped
+		 * caused a page fault. We reset the current kprobe so that
+		 * the IP points back to the probe address, and allow the
+		 * page fault handler to continue as a normal page fault.
+ */
+ instruction_pointer_set(regs, (unsigned long) cur->addr);
+ if (!instruction_pointer(regs))
+ BUG();
+
+ kernel_disable_single_step();
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ spsr_set_debug_flag(regs, 1);
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+
+ break;
+ case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SSDONE:
+ /*
+		 * We increment the nmissed count for accounting;
+		 * we could also use the npre/npostfault counts to
+		 * account for these specific fault cases.
+ */
+ kprobes_inc_nmissed_count(cur);
+
+ /*
+		 * We come here because instructions in the pre/post
+		 * handler caused a page fault. This could happen if
+		 * the handler tries to access user space via
+		 * copy_from_user(), get_user() etc. Let the
+ * user-specified handler try to fix it first.
+ */
+ if (cur->fault_handler && cur->fault_handler(cur, regs, fsr))
+ return 1;
+
+ /*
+ * In case the user-specified fault handler returned
+ * zero, try to fix up.
+ */
+ if (fixup_exception(regs))
+ return 1;
+ }
+ return 0;
+}
+
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return NOTIFY_DONE;
+}
+
+static void __kprobes kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *p, *cur_kprobe;
+ struct kprobe_ctlblk *kcb;
+ unsigned long addr = instruction_pointer(regs);
+
+ kcb = get_kprobe_ctlblk();
+ cur_kprobe = kprobe_running();
+
+ p = get_kprobe((kprobe_opcode_t *) addr);
+
+ if (p) {
+ if (cur_kprobe) {
+ if (reenter_kprobe(p, regs, kcb))
+ return;
+ } else {
+ /* Probe hit */
+ set_current_kprobe(p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ /*
+ * If we have no pre-handler or it returned 0, we
+ * continue with normal processing. If we have a
+ * pre-handler and it returned non-zero, it prepped
+ * for calling the break_handler below on re-entry,
+ * so get out doing nothing more here.
+ *
+			 * The pre_handler can hit a breakpoint and single-step
+			 * before returning, so keep the PSTATE D-flag enabled
+			 * until the pre_handler returns.
+ */
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ setup_singlestep(p, regs, kcb, 0);
+ return;
+ }
+ }
+ } else if ((le32_to_cpu(*(kprobe_opcode_t *) addr) ==
+ BRK64_OPCODE_KPROBES) && cur_kprobe) {
+ /* We probably hit a jprobe. Call its break handler. */
+ if (cur_kprobe->break_handler &&
+ cur_kprobe->break_handler(cur_kprobe, regs)) {
+ setup_singlestep(cur_kprobe, regs, kcb, 0);
+ return;
+ }
+ }
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+	 * Return to the original instruction and continue.
+ */
+}
+
+static int __kprobes
+kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr)
+{
+ if ((kcb->ss_ctx.ss_pending)
+ && (kcb->ss_ctx.match_addr == addr)) {
+ clear_ss_context(kcb); /* clear pending ss */
+ return DBG_HOOK_HANDLED;
+ }
+ /* not ours, kprobes should ignore it */
+ return DBG_HOOK_ERROR;
+}
+
+int __kprobes
+kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ int retval;
+
+ /* return error if this is not our step */
+ retval = kprobe_ss_hit(kcb, instruction_pointer(regs));
+
+ if (retval == DBG_HOOK_HANDLED) {
+ kprobes_restore_local_irqflag(kcb, regs);
+ kernel_disable_single_step();
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ spsr_set_debug_flag(regs, 1);
+
+ post_kprobe_handler(kcb, regs);
+ }
+
+ return retval;
+}
+
+int __kprobes
+kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr)
+{
+ kprobe_handler(regs);
+ return DBG_HOOK_HANDLED;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct jprobe *jp = container_of(p, struct jprobe, kp);
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ kcb->jprobe_saved_regs = *regs;
+ /*
+ * Since we can't be sure where in the stack frame "stacked"
+	 * pass-by-value arguments are stored, we just don't try to
+ * duplicate any of the stack. Do not use jprobes on functions that
+ * use more than 64 bytes (after padding each to an 8 byte boundary)
+ * of arguments, or pass individual arguments larger than 16 bytes.
+ */
+
+ instruction_pointer_set(regs, (unsigned long) jp->entry);
+ preempt_disable();
+ pause_graph_tracing();
+ return 1;
+}
+
+void __kprobes jprobe_return(void)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ /*
+	 * The jprobe handler returns by entering a break exception,
+	 * encoded the same as a kprobe, but with the following differences:
+	 * - a special PC to distinguish it from the other kprobes.
+	 * - the stack address restored from the originally saved pt_regs.
+ */
+ asm volatile(" mov sp, %0 \n"
+ "jprobe_return_break: brk %1 \n"
+ :
+ : "r" (kcb->jprobe_saved_regs.sp),
+ "I" (BRK64_ESR_KPROBES)
+ : "memory");
+
+ unreachable();
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ long stack_addr = kcb->jprobe_saved_regs.sp;
+ long orig_sp = kernel_stack_pointer(regs);
+ struct jprobe *jp = container_of(p, struct jprobe, kp);
+ extern const char jprobe_return_break[];
+
+ if (instruction_pointer(regs) != (u64) jprobe_return_break)
+ return 0;
+
+ if (orig_sp != stack_addr) {
+ struct pt_regs *saved_regs =
+ (struct pt_regs *)kcb->jprobe_saved_regs.sp;
+ pr_err("current sp %lx does not match saved sp %lx\n",
+ orig_sp, stack_addr);
+ pr_err("Saved registers for jprobe %p\n", jp);
+ show_regs(saved_regs);
+ pr_err("Current registers\n");
+ show_regs(regs);
+ BUG();
+ }
+ unpause_graph_tracing();
+ *regs = kcb->jprobe_saved_regs;
+ preempt_enable_no_resched();
+ return 1;
+}
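
[Editor's note: for comparison, the jprobe consumer these handlers support, sketched under the convention that the entry handler mirrors the probed function's signature and ends in jprobe_return(). The symbol and its signature are illustrative.]

#include <linux/kprobes.h>

/* Entry handler: same signature as the (illustrative) probed function. */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
		     unsigned long stack_size, int __user *parent_tidptr,
		     int __user *child_tidptr)
{
	pr_info("jprobe: clone_flags = 0x%lx\n", clone_flags);
	jprobe_return();	/* raises the brk handled above; does not return */
	return 0;
}

static struct jprobe jp = {
	.entry	= (void *)jdo_fork,
	.kp	= { .symbol_name = "_do_fork" },
};

/* Pair register_jprobe(&jp) / unregister_jprobe(&jp) in module init/exit. */
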
+
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+ extern char __idmap_text_start[], __idmap_text_end[];
+
+ if ((addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end) ||
+ (addr >= (unsigned long)__idmap_text_start &&
+ addr < (unsigned long)__idmap_text_end) ||
+ !!search_exception_tables(addr))
+ return true;
+
+ return false;
+}
+
+void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *tmp;
+ unsigned long flags, orig_ret_address = 0;
+ unsigned long trampoline_address =
+ (unsigned long)&kretprobe_trampoline;
+ kprobe_opcode_t *correct_ret_addr = NULL;
+
+ INIT_HLIST_HEAD(&empty_rp);
+ kretprobe_hash_lock(current, &head, &flags);
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task, either because multiple functions in the call path have
+ * return probes installed on them, or because more than one return
+ * probe was registered for a target function (or both).
+ *
+ * We can handle this because:
+ * - instances are always pushed into the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the (chronologically) first instance's ret_addr
+ * will be the real return address, and all the rest will
+ * point to kretprobe_trampoline.
+ */
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ correct_ret_addr = ri->ret_addr;
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ if (ri->rp && ri->rp->handler) {
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
+ get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+ ri->ret_addr = correct_ret_addr;
+ ri->rp->handler(ri, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+
+ recycle_rp_inst(ri, &empty_rp);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_hash_unlock(current, &flags);
+
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+ return (void *)orig_ret_address;
+}
+
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
+
+ /* replace return addr (x30) with trampoline */
+ regs->regs[30] = (long)&kretprobe_trampoline;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
+
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
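
[Editor's note: the kretprobe consumer side of the above: arch_prepare_kretprobe() swaps x30 for kretprobe_trampoline, and trampoline_probe_handler() then calls a handler like this with the restored pt_regs. The probe target is illustrative.]

#include <linux/kprobes.h>

static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* On arm64 the return value is in x0. */
	pr_info("%s returned %ld\n", ri->rp->kp.symbol_name,
		(long)regs->regs[0]);
	return 0;
}

static struct kretprobe rp = {
	.handler	= ret_handler,
	.kp		= { .symbol_name = "_do_fork" },	/* illustrative */
	.maxactive	= 20,	/* instances to pre-allocate for concurrency */
};

/* Pair register_kretprobe(&rp) / unregister_kretprobe(&rp) in init/exit. */
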
diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S
new file mode 100644
index 000000000000..5d6e7f14638c
--- /dev/null
+++ b/arch/arm64/kernel/probes/kprobes_trampoline.S
@@ -0,0 +1,81 @@
+/*
+ * trampoline entry and return code for kretprobes.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
+
+ .text
+
+ .macro save_all_base_regs
+ stp x0, x1, [sp, #S_X0]
+ stp x2, x3, [sp, #S_X2]
+ stp x4, x5, [sp, #S_X4]
+ stp x6, x7, [sp, #S_X6]
+ stp x8, x9, [sp, #S_X8]
+ stp x10, x11, [sp, #S_X10]
+ stp x12, x13, [sp, #S_X12]
+ stp x14, x15, [sp, #S_X14]
+ stp x16, x17, [sp, #S_X16]
+ stp x18, x19, [sp, #S_X18]
+ stp x20, x21, [sp, #S_X20]
+ stp x22, x23, [sp, #S_X22]
+ stp x24, x25, [sp, #S_X24]
+ stp x26, x27, [sp, #S_X26]
+ stp x28, x29, [sp, #S_X28]
+ add x0, sp, #S_FRAME_SIZE
+ stp lr, x0, [sp, #S_LR]
+ /*
+ * Construct a useful saved PSTATE
+ */
+ mrs x0, nzcv
+ mrs x1, daif
+ orr x0, x0, x1
+ mrs x1, CurrentEL
+ orr x0, x0, x1
+ mrs x1, SPSel
+ orr x0, x0, x1
+ stp xzr, x0, [sp, #S_PC]
+ .endm
+
+ .macro restore_all_base_regs
+ ldr x0, [sp, #S_PSTATE]
+ and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT)
+ msr nzcv, x0
+ ldp x0, x1, [sp, #S_X0]
+ ldp x2, x3, [sp, #S_X2]
+ ldp x4, x5, [sp, #S_X4]
+ ldp x6, x7, [sp, #S_X6]
+ ldp x8, x9, [sp, #S_X8]
+ ldp x10, x11, [sp, #S_X10]
+ ldp x12, x13, [sp, #S_X12]
+ ldp x14, x15, [sp, #S_X14]
+ ldp x16, x17, [sp, #S_X16]
+ ldp x18, x19, [sp, #S_X18]
+ ldp x20, x21, [sp, #S_X20]
+ ldp x22, x23, [sp, #S_X22]
+ ldp x24, x25, [sp, #S_X24]
+ ldp x26, x27, [sp, #S_X26]
+ ldp x28, x29, [sp, #S_X28]
+ .endm
+
+ENTRY(kretprobe_trampoline)
+ sub sp, sp, #S_FRAME_SIZE
+
+ save_all_base_regs
+
+ mov x0, sp
+ bl trampoline_probe_handler
+ /*
+ * Replace the trampoline address in lr with the actual
+ * return address (orig_ret_addr).
+ */
+ mov lr, x0
+
+ restore_all_base_regs
+
+ add sp, sp, #S_FRAME_SIZE
+ ret
+
+ENDPROC(kretprobe_trampoline)
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c
new file mode 100644
index 000000000000..8977ce9d009d
--- /dev/null
+++ b/arch/arm64/kernel/probes/simulate-insn.c
@@ -0,0 +1,217 @@
+/*
+ * arch/arm64/kernel/probes/simulate-insn.c
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+
+#include "simulate-insn.h"
+
+#define sign_extend(x, signbit) \
+ ((x) | (0 - ((x) & (1 << (signbit)))))
+
+#define bbl_displacement(insn) \
+ sign_extend(((insn) & 0x3ffffff) << 2, 27)
+
+#define bcond_displacement(insn) \
+ sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+
+#define cbz_displacement(insn) \
+ sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+
+#define tbz_displacement(insn) \
+ sign_extend(((insn >> 5) & 0x3fff) << 2, 15)
+
+#define ldr_displacement(insn) \
+ sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
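
[Editor's note: to sanity-check the displacement macros, a standalone user-space sketch; the instruction value is a worked example, not from this patch. "b .-8" assembles to 0x17fffffe, i.e. imm26 = 0x3fffffe; shifted left by 2 this sets bit 27, so sign_extend() fills in the upper bits and the displacement comes out as -8.]

#include <stdio.h>
#include <stdint.h>

#define sign_extend(x, signbit) ((x) | (0 - ((x) & (1 << (signbit)))))
#define bbl_displacement(insn)  sign_extend(((insn) & 0x3ffffff) << 2, 27)

int main(void)
{
	uint32_t insn = 0x17fffffe;	/* "b .-8": imm26 = -2, scaled by 4 */

	printf("%d\n", (int)bbl_displacement(insn));	/* prints -8 */
	return 0;
}
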
+
+static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val)
+{
+ if (reg < 31)
+ regs->regs[reg] = val;
+}
+
+static inline void set_w_reg(struct pt_regs *regs, int reg, u64 val)
+{
+ if (reg < 31)
+ regs->regs[reg] = lower_32_bits(val);
+}
+
+static inline u64 get_x_reg(struct pt_regs *regs, int reg)
+{
+ if (reg < 31)
+ return regs->regs[reg];
+ else
+ return 0;
+}
+
+static inline u32 get_w_reg(struct pt_regs *regs, int reg)
+{
+ if (reg < 31)
+ return lower_32_bits(regs->regs[reg]);
+ else
+ return 0;
+}
+
+static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs)
+{
+ int xn = opcode & 0x1f;
+
+ return (opcode & (1 << 31)) ?
+ (get_x_reg(regs, xn) == 0) : (get_w_reg(regs, xn) == 0);
+}
+
+static bool __kprobes check_cbnz(u32 opcode, struct pt_regs *regs)
+{
+ int xn = opcode & 0x1f;
+
+ return (opcode & (1 << 31)) ?
+ (get_x_reg(regs, xn) != 0) : (get_w_reg(regs, xn) != 0);
+}
+
+static bool __kprobes check_tbz(u32 opcode, struct pt_regs *regs)
+{
+ int xn = opcode & 0x1f;
+ int bit_pos = ((opcode & (1 << 31)) >> 26) | ((opcode >> 19) & 0x1f);
+
+ return ((get_x_reg(regs, xn) >> bit_pos) & 0x1) == 0;
+}
+
+static bool __kprobes check_tbnz(u32 opcode, struct pt_regs *regs)
+{
+ int xn = opcode & 0x1f;
+ int bit_pos = ((opcode & (1 << 31)) >> 26) | ((opcode >> 19) & 0x1f);
+
+ return ((get_x_reg(regs, xn) >> bit_pos) & 0x1) != 0;
+}
+
+/*
+ * instruction simulation functions
+ */
+void __kprobes
+simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs)
+{
+ long imm, xn, val;
+
+ xn = opcode & 0x1f;
+ imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3);
+ imm = sign_extend(imm, 20);
+ if (opcode & 0x80000000)
+ val = (imm<<12) + (addr & 0xfffffffffffff000);
+ else
+ val = imm + addr;
+
+ set_x_reg(regs, xn, val);
+
+ instruction_pointer_set(regs, instruction_pointer(regs) + 4);
+}
+
+void __kprobes
+simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs)
+{
+ int disp = bbl_displacement(opcode);
+
+ /* Link register is x30 */
+ if (opcode & (1 << 31))
+ set_x_reg(regs, 30, addr + 4);
+
+ instruction_pointer_set(regs, addr + disp);
+}
+
+void __kprobes
+simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs)
+{
+ int disp = 4;
+
+ if (aarch32_opcode_cond_checks[opcode & 0xf](regs->pstate & 0xffffffff))
+ disp = bcond_displacement(opcode);
+
+ instruction_pointer_set(regs, addr + disp);
+}
+
+void __kprobes
+simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs)
+{
+ int xn = (opcode >> 5) & 0x1f;
+
+ /* update pc first in case we're doing a "blr lr" */
+ instruction_pointer_set(regs, get_x_reg(regs, xn));
+
+ /* Link register is x30 */
+ if (((opcode >> 21) & 0x3) == 1)
+ set_x_reg(regs, 30, addr + 4);
+}
+
+void __kprobes
+simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs)
+{
+ int disp = 4;
+
+ if (opcode & (1 << 24)) {
+ if (check_cbnz(opcode, regs))
+ disp = cbz_displacement(opcode);
+ } else {
+ if (check_cbz(opcode, regs))
+ disp = cbz_displacement(opcode);
+ }
+ instruction_pointer_set(regs, addr + disp);
+}
+
+void __kprobes
+simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs)
+{
+ int disp = 4;
+
+ if (opcode & (1 << 24)) {
+ if (check_tbnz(opcode, regs))
+ disp = tbz_displacement(opcode);
+ } else {
+ if (check_tbz(opcode, regs))
+ disp = tbz_displacement(opcode);
+ }
+ instruction_pointer_set(regs, addr + disp);
+}
+
+void __kprobes
+simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs)
+{
+ u64 *load_addr;
+ int xn = opcode & 0x1f;
+ int disp;
+
+ disp = ldr_displacement(opcode);
+ load_addr = (u64 *) (addr + disp);
+
+ if (opcode & (1 << 30)) /* x0-x30 */
+ set_x_reg(regs, xn, *load_addr);
+ else /* w0-w30 */
+ set_w_reg(regs, xn, *load_addr);
+
+ instruction_pointer_set(regs, instruction_pointer(regs) + 4);
+}
+
+void __kprobes
+simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs)
+{
+ s32 *load_addr;
+ int xn = opcode & 0x1f;
+ int disp;
+
+ disp = ldr_displacement(opcode);
+ load_addr = (s32 *) (addr + disp);
+
+ set_x_reg(regs, xn, *load_addr);
+
+ instruction_pointer_set(regs, instruction_pointer(regs) + 4);
+}
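
[Editor's note: these handlers all share one shape — decode operands from the opcode, update regs, and set the pc themselves. Below is a hedged sketch of the dispatch a decoder performs once it classifies an instruction it cannot single-step out of line; the real decode table lives elsewhere in this series, and the masks shown are the architectural B/BL and CBZ/CBNZ encodings.]

#include <linux/kprobes.h>
#include "simulate-insn.h"

typedef void (*probes_handler_t)(u32 opcode, long addr, struct pt_regs *regs);

static void simulate(u32 opcode, long addr, struct pt_regs *regs)
{
	probes_handler_t fn;

	if ((opcode & 0x7c000000) == 0x14000000)	/* B / BL */
		fn = simulate_b_bl;
	else if ((opcode & 0x7e000000) == 0x34000000)	/* CBZ / CBNZ */
		fn = simulate_cbz_cbnz;
	else
		return;		/* not simulated in this sketch */

	fn(opcode, addr, regs);	/* updates regs, including the pc */
}
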
diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h
new file mode 100644
index 000000000000..050bde683c2d
--- /dev/null
+++ b/arch/arm64/kernel/probes/simulate-insn.h
@@ -0,0 +1,28 @@
+/*
+ * arch/arm64/kernel/probes/simulate-insn.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _ARM_KERNEL_KPROBES_SIMULATE_INSN_H
+#define _ARM_KERNEL_KPROBES_SIMULATE_INSN_H
+
+void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs);
+
+#endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f75b540bc3b4..80624829db61 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -46,6 +46,7 @@
#include <linux/notifier.h>
#include <trace/events/power.h>
+#include <asm/alternative.h>
#include <asm/compat.h>
#include <asm/cacheflush.h>
#include <asm/fpsimd.h>
@@ -280,6 +281,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
} else {
memset(childregs, 0, sizeof(struct pt_regs));
childregs->pstate = PSR_MODE_EL1h;
+ if (IS_ENABLED(CONFIG_ARM64_UAO) &&
+ cpus_have_cap(ARM64_HAS_UAO))
+ childregs->pstate |= PSR_UAO_BIT;
p->thread.cpu_context.x19 = stack_start;
p->thread.cpu_context.x20 = stk_sz;
}
@@ -308,6 +312,17 @@ static void tls_thread_switch(struct task_struct *next)
: : "r" (tpidr), "r" (tpidrro));
}
+/* Restore the UAO state depending on next's addr_limit */
+static void uao_thread_switch(struct task_struct *next)
+{
+ if (IS_ENABLED(CONFIG_ARM64_UAO)) {
+ if (task_thread_info(next)->addr_limit == KERNEL_DS)
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO));
+ else
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO));
+ }
+}
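
[Editor's note: for context, the addr_limit tested above is the one toggled by the long-standing get_fs()/set_fs() pattern; a minimal sketch, with the access itself elided.]

#include <asm/uaccess.h>

static void kernel_ds_window(void)
{
	mm_segment_t old_fs = get_fs();

	set_fs(KERNEL_DS);	/* addr_limit = KERNEL_DS: uaccess may touch
				 * kernel VAs, so PSTATE.UAO gets set for us */
	/* ... copy_from_user()-style access to a kernel address ... */
	set_fs(old_fs);
}
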
+
/*
* Thread switching.
*/
@@ -320,6 +335,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
contextidr_thread_switch(next);
+ uao_thread_switch(next);
/*
* Complete any pending TLB or cache maintenance on this CPU in case
@@ -344,11 +360,14 @@ unsigned long get_wchan(struct task_struct *p)
frame.fp = thread_saved_fp(p);
frame.sp = thread_saved_sp(p);
frame.pc = thread_saved_pc(p);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ frame.graph = p->curr_ret_stack;
+#endif
stack_page = (unsigned long)task_stack_page(p);
do {
if (frame.sp < stack_page ||
frame.sp >= stack_page + THREAD_SIZE ||
- unwind_frame(&frame))
+ unwind_frame(p, &frame))
return 0;
if (!in_sched_functions(frame.pc))
return frame.pc;
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index f67f35b6edb1..42816bebb1e0 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -20,7 +20,6 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/psci.h>
-#include <linux/slab.h>
#include <uapi/linux/psci.h>
@@ -28,73 +27,6 @@
#include <asm/cpu_ops.h>
#include <asm/errno.h>
#include <asm/smp_plat.h>
-#include <asm/suspend.h>
-
-static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
-
-static int __maybe_unused cpu_psci_cpu_init_idle(unsigned int cpu)
-{
- int i, ret, count = 0;
- u32 *psci_states;
- struct device_node *state_node, *cpu_node;
-
- cpu_node = of_get_cpu_node(cpu, NULL);
- if (!cpu_node)
- return -ENODEV;
-
- /*
- * If the PSCI cpu_suspend function hook has not been initialized
- * idle states must not be enabled, so bail out
- */
- if (!psci_ops.cpu_suspend)
- return -EOPNOTSUPP;
-
- /* Count idle states */
- while ((state_node = of_parse_phandle(cpu_node, "cpu-idle-states",
- count))) {
- count++;
- of_node_put(state_node);
- }
-
- if (!count)
- return -ENODEV;
-
- psci_states = kcalloc(count, sizeof(*psci_states), GFP_KERNEL);
- if (!psci_states)
- return -ENOMEM;
-
- for (i = 0; i < count; i++) {
- u32 state;
-
- state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
-
- ret = of_property_read_u32(state_node,
- "arm,psci-suspend-param",
- &state);
- if (ret) {
- pr_warn(" * %s missing arm,psci-suspend-param property\n",
- state_node->full_name);
- of_node_put(state_node);
- goto free_mem;
- }
-
- of_node_put(state_node);
- pr_debug("psci-power-state %#x index %d\n", state, i);
- if (!psci_power_state_is_valid(state)) {
- pr_warn("Invalid PSCI power state %#x\n", state);
- ret = -EINVAL;
- goto free_mem;
- }
- psci_states[i] = state;
- }
- /* Idle states parsed correctly, initialize per-cpu pointer */
- per_cpu(psci_power_state, cpu) = psci_states;
- return 0;
-
-free_mem:
- kfree(psci_states);
- return ret;
-}
static int __init cpu_psci_cpu_init(unsigned int cpu)
{
@@ -178,38 +110,11 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
}
#endif
-static int psci_suspend_finisher(unsigned long index)
-{
- u32 *state = __this_cpu_read(psci_power_state);
-
- return psci_ops.cpu_suspend(state[index - 1],
- virt_to_phys(cpu_resume));
-}
-
-static int __maybe_unused cpu_psci_cpu_suspend(unsigned long index)
-{
- int ret;
- u32 *state = __this_cpu_read(psci_power_state);
- /*
- * idle state index 0 corresponds to wfi, should never be called
- * from the cpu_suspend operations
- */
- if (WARN_ON_ONCE(!index))
- return -EINVAL;
-
- if (!psci_power_state_loses_context(state[index - 1]))
- ret = psci_ops.cpu_suspend(state[index - 1], 0);
- else
- ret = cpu_suspend(index, psci_suspend_finisher);
-
- return ret;
-}
-
const struct cpu_operations cpu_psci_ops = {
.name = "psci",
#ifdef CONFIG_CPU_IDLE
- .cpu_init_idle = cpu_psci_cpu_init_idle,
- .cpu_suspend = cpu_psci_cpu_suspend,
+ .cpu_init_idle = psci_cpu_init_idle,
+ .cpu_suspend = psci_cpu_suspend_enter,
#endif
.cpu_init = cpu_psci_cpu_init,
.cpu_prepare = cpu_psci_cpu_prepare,
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 55909b2208cc..c5ef05959813 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -49,6 +49,106 @@
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+#define GPR_OFFSET_NAME(r) \
+ {.name = "x" #r, .offset = offsetof(struct pt_regs, regs[r])}
+
+static const struct pt_regs_offset regoffset_table[] = {
+ GPR_OFFSET_NAME(0),
+ GPR_OFFSET_NAME(1),
+ GPR_OFFSET_NAME(2),
+ GPR_OFFSET_NAME(3),
+ GPR_OFFSET_NAME(4),
+ GPR_OFFSET_NAME(5),
+ GPR_OFFSET_NAME(6),
+ GPR_OFFSET_NAME(7),
+ GPR_OFFSET_NAME(8),
+ GPR_OFFSET_NAME(9),
+ GPR_OFFSET_NAME(10),
+ GPR_OFFSET_NAME(11),
+ GPR_OFFSET_NAME(12),
+ GPR_OFFSET_NAME(13),
+ GPR_OFFSET_NAME(14),
+ GPR_OFFSET_NAME(15),
+ GPR_OFFSET_NAME(16),
+ GPR_OFFSET_NAME(17),
+ GPR_OFFSET_NAME(18),
+ GPR_OFFSET_NAME(19),
+ GPR_OFFSET_NAME(20),
+ GPR_OFFSET_NAME(21),
+ GPR_OFFSET_NAME(22),
+ GPR_OFFSET_NAME(23),
+ GPR_OFFSET_NAME(24),
+ GPR_OFFSET_NAME(25),
+ GPR_OFFSET_NAME(26),
+ GPR_OFFSET_NAME(27),
+ GPR_OFFSET_NAME(28),
+ GPR_OFFSET_NAME(29),
+ GPR_OFFSET_NAME(30),
+ {.name = "lr", .offset = offsetof(struct pt_regs, regs[30])},
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(pc),
+ REG_OFFSET_NAME(pstate),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL.
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return *addr;
+ else
+ return 0;
+}
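
[Editor's note: a hedged sketch of how a consumer combines these helpers to fetch a register by name. The wrapper is illustrative; the offset arithmetic is what a regs_get_register()-style accessor boils down to.]

#include <linux/kernel.h>
#include <asm/ptrace.h>

static u64 read_reg_by_name(struct pt_regs *regs, const char *name)
{
	int offs = regs_query_register_offset(name);	/* "x0", "lr", "sp", ... */

	if (offs < 0)
		return 0;	/* unknown register name */
	return *(u64 *)((unsigned long)regs + offs);
}

/* Fourth word on the kernel stack, or 0 if out of bounds:
 *	u64 v = regs_get_kernel_stack_nth(regs, 3);
 */
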
+
/*
* TODO: does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
index 6c4fd2810ecb..1718706fde83 100644
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -43,8 +43,11 @@ void *return_address(unsigned int level)
frame.fp = (unsigned long)__builtin_frame_address(0);
frame.sp = current_stack_pointer;
frame.pc = (unsigned long)return_address; /* dummy */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ frame.graph = current->curr_ret_stack;
+#endif
- walk_stackframe(&frame, save_return_addr, &data);
+ walk_stackframe(current, &frame, save_return_addr, &data);
if (!data.level)
return data.addr;
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 8119479147db..1e33d967c0ae 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -62,6 +62,7 @@
#include <asm/memblock.h>
#include <asm/efi.h>
#include <asm/xen/hypervisor.h>
+#include <asm/mmu_context.h>
phys_addr_t __fdt_pointer __initdata;
@@ -174,7 +175,6 @@ static void __init smp_build_mpidr_hash(void)
*/
if (mpidr_hash_size() > 4 * num_possible_cpus())
pr_warn("Large number of MPIDR hash buckets detected\n");
- __flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash));
}
static void __init setup_machine_fdt(phys_addr_t dt_phys)
@@ -313,6 +313,12 @@ void __init setup_arch(char **cmdline_p)
*/
local_async_enable();
+ /*
+ * TTBR0 is only used for the identity mapping at this stage. Make it
+ * point to zero page to avoid speculatively fetching new entries.
+ */
+ cpu_uninstall_idmap();
+
efi_init();
arm64_memblock_init();
@@ -381,3 +387,32 @@ static int __init topology_init(void)
return 0;
}
subsys_initcall(topology_init);
+
+/*
+ * Dump out kernel offset information on panic.
+ */
+static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
+ void *p)
+{
+ u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) {
+ pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
+ kaslr_offset, KIMAGE_VADDR);
+ } else {
+ pr_emerg("Kernel Offset: disabled\n");
+ }
+ return 0;
+}
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index f586f7c875e2..c2bf5a58039f 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -49,39 +49,32 @@
orr \dst, \dst, \mask // dst|=(aff3>>rs3)
.endm
/*
- * Save CPU state for a suspend and execute the suspend finisher.
- * On success it will return 0 through cpu_resume - ie through a CPU
- * soft/hard reboot from the reset vector.
- * On failure it returns the suspend finisher return value or force
- * -EOPNOTSUPP if the finisher erroneously returns 0 (the suspend finisher
- * is not allowed to return, if it does this must be considered failure).
- * It saves callee registers, and allocates space on the kernel stack
- * to save the CPU specific registers + some other data for resume.
+ * Save CPU state in the provided sleep_stack_data area, and publish its
+ * location for cpu_resume()'s use in sleep_save_stash.
*
- * x0 = suspend finisher argument
- * x1 = suspend finisher function pointer
+ * cpu_resume() will restore this saved state, and return. Because the
+ * link-register is saved and restored, it will appear to return from this
+ * function. So that the caller can tell the suspend/resume paths apart,
+ * __cpu_suspend_enter() will always return a non-zero value, whereas the
+ * path through cpu_resume() will return 0.
+ *
+ * x0 = struct sleep_stack_data area
*/
ENTRY(__cpu_suspend_enter)
- stp x29, lr, [sp, #-96]!
- stp x19, x20, [sp,#16]
- stp x21, x22, [sp,#32]
- stp x23, x24, [sp,#48]
- stp x25, x26, [sp,#64]
- stp x27, x28, [sp,#80]
- /*
- * Stash suspend finisher and its argument in x20 and x19
- */
- mov x19, x0
- mov x20, x1
+ stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS]
+ stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16]
+ stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32]
+ stp x23, x24, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+48]
+ stp x25, x26, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+64]
+ stp x27, x28, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+80]
+
+ /* save the sp in cpu_suspend_ctx */
mov x2, sp
- sub sp, sp, #CPU_SUSPEND_SZ // allocate cpu_suspend_ctx
- mov x0, sp
- /*
- * x0 now points to struct cpu_suspend_ctx allocated on the stack
- */
- str x2, [x0, #CPU_CTX_SP]
- ldr x1, =sleep_save_sp
- ldr x1, [x1, #SLEEP_SAVE_SP_VIRT]
+ str x2, [x0, #SLEEP_STACK_DATA_SYSTEM_REGS + CPU_CTX_SP]
+
+ /* find the mpidr_hash */
+ ldr x1, =sleep_save_stash
+ ldr x1, [x1]
mrs x7, mpidr_el1
ldr x9, =mpidr_hash
ldr x10, [x9, #MPIDR_HASH_MASK]
@@ -93,70 +86,28 @@ ENTRY(__cpu_suspend_enter)
ldp w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)]
compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10
add x1, x1, x8, lsl #3
- bl __cpu_suspend_save
- /*
- * Grab suspend finisher in x20 and its argument in x19
- */
- mov x0, x19
- mov x1, x20
- /*
- * We are ready for power down, fire off the suspend finisher
- * in x1, with argument in x0
- */
- blr x1
- /*
- * Never gets here, unless suspend finisher fails.
- * Successful cpu_suspend should return from cpu_resume, returning
- * through this code path is considered an error
- * If the return value is set to 0 force x0 = -EOPNOTSUPP
- * to make sure a proper error condition is propagated
- */
- cmp x0, #0
- mov x3, #-EOPNOTSUPP
- csel x0, x3, x0, eq
- add sp, sp, #CPU_SUSPEND_SZ // rewind stack pointer
- ldp x19, x20, [sp, #16]
- ldp x21, x22, [sp, #32]
- ldp x23, x24, [sp, #48]
- ldp x25, x26, [sp, #64]
- ldp x27, x28, [sp, #80]
- ldp x29, lr, [sp], #96
+
+ str x0, [x1]
+ add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS
+ stp x29, lr, [sp, #-16]!
+ bl cpu_do_suspend
+ ldp x29, lr, [sp], #16
+ mov x0, #1
ret
ENDPROC(__cpu_suspend_enter)
.ltorg
-/*
- * x0 must contain the sctlr value retrieved from restored context
- */
- .pushsection ".idmap.text", "ax"
-ENTRY(cpu_resume_mmu)
- ldr x3, =cpu_resume_after_mmu
- msr sctlr_el1, x0 // restore sctlr_el1
- isb
- /*
- * Invalidate the local I-cache so that any instructions fetched
- * speculatively from the PoC are discarded, since they may have
- * been dynamically patched at the PoU.
- */
- ic iallu
- dsb nsh
- isb
- br x3 // global jump to virtual address
-ENDPROC(cpu_resume_mmu)
- .popsection
-cpu_resume_after_mmu:
- mov x0, #0 // return zero on success
- ldp x19, x20, [sp, #16]
- ldp x21, x22, [sp, #32]
- ldp x23, x24, [sp, #48]
- ldp x25, x26, [sp, #64]
- ldp x27, x28, [sp, #80]
- ldp x29, lr, [sp], #96
- ret
-ENDPROC(cpu_resume_after_mmu)
-
ENTRY(cpu_resume)
bl el2_setup // if in EL2 drop to EL1 cleanly
+ /* enable the MMU early - so we can access sleep_save_stash by va */
+ adr_l lr, __enable_mmu /* __cpu_setup will return here */
+ ldr x27, =_cpu_resume /* __enable_mmu will branch here */
+ adrp x25, idmap_pg_dir
+ adrp x26, swapper_pg_dir
+ b __cpu_setup
+ENDPROC(cpu_resume)
+
+ENTRY(_cpu_resume)
mrs x1, mpidr_el1
adrp x8, mpidr_hash
add x8, x8, #:lo12:mpidr_hash // x8 = struct mpidr_hash phys address
@@ -166,17 +117,27 @@ ENTRY(cpu_resume)
ldp w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)]
compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2
/* x7 contains hash index, let's use it to grab context pointer */
- ldr_l x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS
+ ldr_l x0, sleep_save_stash
ldr x0, [x0, x7, lsl #3]
+ add x29, x0, #SLEEP_STACK_DATA_CALLEE_REGS
+ add x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS
/* load sp from context */
ldr x2, [x0, #CPU_CTX_SP]
- /* load physical address of identity map page table in x1 */
- adrp x1, idmap_pg_dir
mov sp, x2
+ /* save thread_info */
+ and x2, x2, #~(THREAD_SIZE - 1)
+ msr sp_el0, x2
/*
- * cpu_do_resume expects x0 to contain context physical address
- * pointer and x1 to contain physical address of 1:1 page tables
+ * cpu_do_resume expects x0 to contain context address pointer
*/
- bl cpu_do_resume // PC relative jump, MMU off
- b cpu_resume_mmu // Resume MMU, never returns
-ENDPROC(cpu_resume)
+ bl cpu_do_resume
+
+ ldp x19, x20, [x29, #16]
+ ldp x21, x22, [x29, #32]
+ ldp x23, x24, [x29, #48]
+ ldp x25, x26, [x29, #64]
+ ldp x27, x28, [x29, #80]
+ ldp x29, lr, [x29]
+ mov x0, #0
+ ret
+ENDPROC(_cpu_resume)
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
new file mode 100644
index 000000000000..ae0496fa4235
--- /dev/null
+++ b/arch/arm64/kernel/smccc-call.S
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License Version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+ .macro SMCCC instr
+ .cfi_startproc
+ \instr #0
+ ldr x4, [sp]
+ stp x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
+ stp x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS]
+ ret
+ .cfi_endproc
+ .endm
+
+/*
+ * void arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
+ * unsigned long a3, unsigned long a4, unsigned long a5,
+ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_smc)
+ SMCCC smc
+ENDPROC(arm_smccc_smc)
+
+/*
+ * void arm_smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
+ * unsigned long a3, unsigned long a4, unsigned long a5,
+ * unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_hvc)
+ SMCCC hvc
+ENDPROC(arm_smccc_hvc)
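
[Editor's note: from C, a call through this veneer looks like the sketch below. The prototype matches the comments above; the function ID shown is the SMC32 PSCI_VERSION ID, used here only as an example.]

#include <linux/arm-smccc.h>

static long query_psci_version(void)
{
	struct arm_smccc_res res;

	/* a0 = function ID; a1..a7 unused for this call */
	arm_smccc_smc(0x84000000, 0, 0, 0, 0, 0, 0, 0, &res);

	/* x0..x3 at SMC return land in res.a0..res.a3 */
	return res.a0;
}
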
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index f3c3d8fee5ba..a84623d91410 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -70,6 +70,7 @@ enum ipi_msg_type {
IPI_CPU_STOP,
IPI_TIMER,
IPI_IRQ_WORK,
+ IPI_WAKEUP
};
/*
@@ -149,9 +150,7 @@ asmlinkage void secondary_start_kernel(void)
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
*/
- cpu_set_reserved_ttbr0();
- local_flush_tlb_all();
- cpu_set_default_tcr_t0sz();
+ cpu_uninstall_idmap();
preempt_disable();
trace_hardirqs_off();
@@ -444,6 +443,17 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
/* map the logical cpu id to cpu MPIDR */
cpu_logical_map(cpu_count) = hwid;
+ /*
+ * Set up the ACPI parking protocol cpu entries
+ * while initializing the cpu_logical_map, to
+ * avoid parsing MADT entries multiple times for
+ * nothing (i.e. a valid cpu_logical_map entry should
+ * contain valid parking protocol data to
+ * initialize the cpu if the parking protocol is
+ * the only available enable method).
+ */
+ acpi_set_mailbox_entry(cpu_count, processor);
+
cpu_count++;
}
@@ -626,6 +636,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
S(IPI_CPU_STOP, "CPU stop interrupts"),
S(IPI_TIMER, "Timer broadcast interrupts"),
S(IPI_IRQ_WORK, "IRQ work interrupts"),
+ S(IPI_WAKEUP, "CPU wake-up interrupts"),
};
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
@@ -669,6 +680,13 @@ void arch_send_call_function_single_ipi(int cpu)
smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC);
}
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
+{
+ smp_cross_call(mask, IPI_WAKEUP);
+}
+#endif
+
#ifdef CONFIG_IRQ_WORK
void arch_irq_work_raise(void)
{
@@ -746,6 +764,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
break;
#endif
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+ case IPI_WAKEUP:
+ WARN_ONCE(!acpi_parking_protocol_valid(cpu),
+ "CPU%u: Wake-up IPI outside the ACPI parking protocol\n",
+ cpu);
+ break;
+#endif
+
default:
pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
break;
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index ccb6078ed9f2..cfd46c227c8c 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -17,9 +17,11 @@
*/
#include <linux/kernel.h>
#include <linux/export.h>
+#include <linux/ftrace.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>
+#include <asm/irq.h>
#include <asm/stacktrace.h>
/*
@@ -35,25 +37,82 @@
* ldp x29, x30, [sp]
* add sp, sp, #0x10
*/
-int notrace unwind_frame(struct stackframe *frame)
+int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
{
unsigned long high, low;
unsigned long fp = frame->fp;
+ unsigned long irq_stack_ptr;
+
+ /*
+ * Switching between stacks is valid when tracing current and in
+ * non-preemptible context.
+ */
+ if (tsk == current && !preemptible())
+ irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+ else
+ irq_stack_ptr = 0;
low = frame->sp;
- high = ALIGN(low, THREAD_SIZE);
+ /* irq stacks are not THREAD_SIZE aligned */
+ if (on_irq_stack(frame->sp, raw_smp_processor_id()))
+ high = irq_stack_ptr;
+ else
+ high = ALIGN(low, THREAD_SIZE) - 0x20;
- if (fp < low || fp > high - 0x18 || fp & 0xf)
+ if (fp < low || fp > high || fp & 0xf)
return -EINVAL;
frame->sp = fp + 0x10;
frame->fp = *(unsigned long *)(fp);
frame->pc = *(unsigned long *)(fp + 8);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (tsk && tsk->ret_stack &&
+ (frame->pc == (unsigned long)return_to_handler)) {
+ /*
+ * This is a case where function graph tracer has
+ * modified a return address (LR) in a stack frame
+ * to hook a function return.
+ * So replace it to an original value.
+ */
+ frame->pc = tsk->ret_stack[frame->graph--].ret;
+ }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+ /*
+ * Check whether we are about to walk from the interrupt stack
+ * to the task stack.
+ * If we reach the end of the stack and it's an interrupt stack,
+ * unpack the dummy frame to find the original elr.
+ *
+ * Check that the frame->fp we read from the bottom of the irq_stack
+ * and the original task stack pointer are both within current's stack.
+ */
+ if (frame->sp == irq_stack_ptr) {
+ struct pt_regs *irq_args;
+ unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr);
+
+ if (object_is_on_stack((void *)orig_sp) &&
+ object_is_on_stack((void *)frame->fp)) {
+ frame->sp = orig_sp;
+
+ /* orig_sp is the saved pt_regs, find the elr */
+ irq_args = (struct pt_regs *)orig_sp;
+ frame->pc = irq_args->pc;
+ } else {
+ /*
+ * This frame has a non-standard format, and we
+ * didn't fix it, because the data looked wrong.
+ * Refuse to output this frame.
+ */
+ return -EINVAL;
+ }
+ }
+
return 0;
}
-void notrace walk_stackframe(struct stackframe *frame,
+void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
int (*fn)(struct stackframe *, void *), void *data)
{
while (1) {
@@ -61,7 +120,7 @@ void notrace walk_stackframe(struct stackframe *frame,
if (fn(frame, data))
break;
- ret = unwind_frame(frame);
+ ret = unwind_frame(tsk, frame);
if (ret < 0)
break;
}
@@ -112,8 +171,11 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
frame.sp = current_stack_pointer;
frame.pc = (unsigned long)save_stack_trace_tsk;
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ frame.graph = tsk->curr_ret_stack;
+#endif
- walk_stackframe(&frame, save_trace, &data);
+ walk_stackframe(tsk, &frame, save_trace, &data);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
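
[Editor's note: a hedged sketch of a walker built on the reworked API; the callback and message are illustrative. Note the frame seeding and the new tsk argument, mirroring save_stack_trace_tsk() above.]

#include <linux/sched.h>
#include <asm/stacktrace.h>

static int print_entry(struct stackframe *frame, void *data)
{
	pr_info("  pc: %pS\n", (void *)frame->pc);
	return 0;	/* non-zero would stop the walk */
}

static void dump_current_stack(void)
{
	struct stackframe frame = {
		.fp = (unsigned long)__builtin_frame_address(0),
		.sp = current_stack_pointer,
		.pc = (unsigned long)dump_current_stack,
	};

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	frame.graph = current->curr_ret_stack;
#endif
	walk_stackframe(current, &frame, print_entry, NULL);
}
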
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 00c1372bf57b..5a0b1088c17c 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -12,30 +12,11 @@
#include <asm/suspend.h>
#include <asm/tlbflush.h>
-extern int __cpu_suspend_enter(unsigned long arg, int (*fn)(unsigned long));
/*
- * This is called by __cpu_suspend_enter() to save the state, and do whatever
- * flushing is required to ensure that when the CPU goes to sleep we have
- * the necessary data available when the caches are not searched.
- *
- * ptr: CPU context virtual address
- * save_ptr: address of the location where the context physical address
- * must be saved
+ * This is allocated by cpu_suspend_init(), and used to store a pointer to
+ * the 'struct sleep_stack_data' that contains a particular CPU's state.
*/
-void notrace __cpu_suspend_save(struct cpu_suspend_ctx *ptr,
- phys_addr_t *save_ptr)
-{
- *save_ptr = virt_to_phys(ptr);
-
- cpu_do_suspend(ptr);
- /*
- * Only flush the context that must be retrieved with the MMU
- * off. VA primitives ensure the flush is applied to all
- * cache levels so context is pushed to DRAM.
- */
- __flush_dcache_area(ptr, sizeof(*ptr));
- __flush_dcache_area(save_ptr, sizeof(*save_ptr));
-}
+unsigned long *sleep_save_stash;
/*
* This hook is provided so that cpu_suspend code can restore HW
@@ -53,6 +34,30 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
hw_breakpoint_restore = hw_bp_restore;
}
+void notrace __cpu_suspend_exit(void)
+{
+ /*
+ * We are resuming from reset with the idmap active in TTBR0_EL1.
+ * We must uninstall the idmap and restore the expected MMU
+ * state before we can possibly return to userspace.
+ */
+ cpu_uninstall_idmap();
+
+ /*
+ * Restore per-cpu offset before any kernel
+ * subsystem relying on it has a chance to run.
+ */
+ set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+
+ /*
+ * Restore HW breakpoint registers to sane values
+ * before debug exceptions are possibly reenabled
+ * through local_dbg_restore.
+ */
+ if (hw_breakpoint_restore)
+ hw_breakpoint_restore(NULL);
+}
+
/*
* cpu_suspend
*
@@ -62,9 +67,9 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
*/
int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
{
- struct mm_struct *mm = current->active_mm;
- int ret;
+ int ret = 0;
unsigned long flags;
+ struct sleep_stack_data state;
/*
* From this point debug exceptions are disabled to prevent
@@ -80,37 +85,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
*/
pause_graph_tracing();
- /*
- * mm context saved on the stack, it will be restored when
- * the cpu comes out of reset through the identity mapped
- * page tables, so that the thread address space is properly
- * set-up on function return.
- */
- ret = __cpu_suspend_enter(arg, fn);
- if (ret == 0) {
- /*
- * We are resuming from reset with TTBR0_EL1 set to the
- * idmap to enable the MMU; set the TTBR0 to the reserved
- * page tables to prevent speculative TLB allocations, flush
- * the local tlb and set the default tcr_el1.t0sz so that
- * the TTBR0 address space set-up is properly restored.
- * If the current active_mm != &init_mm we entered cpu_suspend
- * with mappings in TTBR0 that must be restored, so we switch
- * them back to complete the address space configuration
- * restoration before returning.
- */
- cpu_set_reserved_ttbr0();
- local_flush_tlb_all();
- cpu_set_default_tcr_t0sz();
-
- if (mm != &init_mm)
- cpu_switch_mm(mm->pgd, mm);
-
- /*
- * Restore per-cpu offset before any kernel
- * subsystem relying on it has a chance to run.
- */
- set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+ if (__cpu_suspend_enter(&state)) {
+ /* Call the suspend finisher */
+ ret = fn(arg);
/*
* PSTATE was not saved over suspend/resume, re-enable any
@@ -124,8 +101,10 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
* before debug exceptions are possibly reenabled
* through local_dbg_restore.
*/
- if (hw_breakpoint_restore)
- hw_breakpoint_restore(NULL);
+ if (!ret)
+ ret = -EOPNOTSUPP;
+ } else {
+ __cpu_suspend_exit();
}
unpause_graph_tracing();
@@ -140,22 +119,15 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
return ret;
}
-struct sleep_save_sp sleep_save_sp;
-
static int __init cpu_suspend_init(void)
{
- void *ctx_ptr;
-
/* ctx_ptr is an array of physical addresses */
- ctx_ptr = kcalloc(mpidr_hash_size(), sizeof(phys_addr_t), GFP_KERNEL);
+ sleep_save_stash = kcalloc(mpidr_hash_size(), sizeof(*sleep_save_stash),
+ GFP_KERNEL);
- if (WARN_ON(!ctx_ptr))
+ if (WARN_ON(!sleep_save_stash))
return -ENOMEM;
- sleep_save_sp.save_ptr_stash = ctx_ptr;
- sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr);
- __flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp));
-
return 0;
}
early_initcall(cpu_suspend_init);
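
[Editor's note: the calling convention this rework preserves, sketched with a hypothetical finisher — compare the removed psci_suspend_finisher above. The finisher must not return on success, and cpu_suspend() returns 0 only when the CPU comes back through cpu_resume().]

#include <asm/suspend.h>

/* Hypothetical finisher: fires the actual power-down call. */
static int my_suspend_finisher(unsigned long index)
{
	/* e.g. psci_ops.cpu_suspend(state, virt_to_phys(cpu_resume)); */
	return -EIO;	/* reaching here at all means power-down failed */
}

static int my_cpu_idle_enter(unsigned long index)
{
	/*
	 * 0: we slept and resumed through cpu_resume().
	 * Non-zero: the finisher failed (forced to -EOPNOTSUPP if it
	 * erroneously returned 0, per the fix-up above).
	 */
	return cpu_suspend(index, my_suspend_finisher);
}
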
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 13339b6ffc1a..59779699a1a4 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -52,8 +52,11 @@ unsigned long profile_pc(struct pt_regs *regs)
frame.fp = regs->regs[29];
frame.sp = regs->sp;
frame.pc = regs->pc;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ frame.graph = -1; /* no task info */
+#endif
do {
- int ret = unwind_frame(&frame);
+ int ret = unwind_frame(NULL, &frame);
if (ret < 0)
return 0;
} while (in_lock_functions(frame.pc));
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index ca7f0ac5f708..a036ff290d69 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -146,17 +146,24 @@ static void dump_instr(const char *lvl, struct pt_regs *regs)
static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
{
struct stackframe frame;
+ unsigned long irq_stack_ptr;
+ int skip;
+
+ /*
+ * Switching between stacks is valid when tracing current and in
+ * non-preemptible context.
+ */
+ if (tsk == current && !preemptible())
+ irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+ else
+ irq_stack_ptr = 0;
pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
if (!tsk)
tsk = current;
- if (regs) {
- frame.fp = regs->regs[29];
- frame.sp = regs->sp;
- frame.pc = regs->pc;
- } else if (tsk == current) {
+ if (tsk == current) {
frame.fp = (unsigned long)__builtin_frame_address(0);
frame.sp = current_stack_pointer;
frame.pc = (unsigned long)dump_backtrace;
@@ -168,21 +175,49 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
frame.sp = thread_saved_sp(tsk);
frame.pc = thread_saved_pc(tsk);
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ frame.graph = tsk->curr_ret_stack;
+#endif
- pr_emerg("Call trace:\n");
+ skip = !!regs;
+ printk("Call trace:\n");
while (1) {
unsigned long where = frame.pc;
unsigned long stack;
int ret;
- dump_backtrace_entry(where);
- ret = unwind_frame(&frame);
+ /* skip until specified stack frame */
+ if (!skip) {
+ dump_backtrace_entry(where);
+ } else if (frame.fp == regs->regs[29]) {
+ skip = 0;
+ /*
+ * Mostly, this is the case where this function is
+ * called in panic/abort. As the exception handler's
+ * stack frame does not contain the pc at which the
+ * exception was taken, use regs->pc instead.
+ */
+ dump_backtrace_entry(regs->pc);
+ }
+ ret = unwind_frame(tsk, &frame);
if (ret < 0)
break;
stack = frame.sp;
- if (in_exception_text(where))
+ if (in_exception_text(where)) {
+ /*
+ * If we switched to the irq_stack before calling this
+ * exception handler, then the pt_regs will be on the
+ * task stack. The easiest way to tell is if the large
+ * pt_regs would overlap with the end of the irq_stack.
+ */
+ if (stack < irq_stack_ptr &&
+ (stack + sizeof(struct pt_regs)) > irq_stack_ptr)
+ stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr);
+
dump_mem("", "Exception stack", stack,
stack + sizeof(struct pt_regs), false);
+ }
}
}
@@ -476,22 +511,22 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
void __pte_error(const char *file, int line, unsigned long val)
{
- pr_crit("%s:%d: bad pte %016lx.\n", file, line, val);
+ pr_err("%s:%d: bad pte %016lx.\n", file, line, val);
}
void __pmd_error(const char *file, int line, unsigned long val)
{
- pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val);
+ pr_err("%s:%d: bad pmd %016lx.\n", file, line, val);
}
void __pud_error(const char *file, int line, unsigned long val)
{
- pr_crit("%s:%d: bad pud %016lx.\n", file, line, val);
+ pr_err("%s:%d: bad pud %016lx.\n", file, line, val);
}
void __pgd_error(const char *file, int line, unsigned long val)
{
- pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val);
+ pr_err("%s:%d: bad pgd %016lx.\n", file, line, val);
}
/* GENERIC_BUG traps */
diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S
index 60c1db54b41a..82379a70ef03 100644
--- a/arch/arm64/kernel/vdso/vdso.S
+++ b/arch/arm64/kernel/vdso/vdso.S
@@ -21,9 +21,8 @@
#include <linux/const.h>
#include <asm/page.h>
- __PAGE_ALIGNED_DATA
-
.globl vdso_start, vdso_end
+ .section .rodata
.balign PAGE_SIZE
vdso_start:
.incbin "arch/arm64/kernel/vdso/vdso.so"
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 71426a78db12..623532f44323 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -46,6 +46,16 @@ jiffies = jiffies_64;
*(.idmap.text) \
VMLINUX_SYMBOL(__idmap_text_end) = .;
+#ifdef CONFIG_HIBERNATION
+#define HIBERNATE_TEXT \
+ . = ALIGN(SZ_4K); \
+ VMLINUX_SYMBOL(__hibernate_exit_text_start) = .;\
+ *(.hibernate_exit.text) \
+ VMLINUX_SYMBOL(__hibernate_exit_text_end) = .;
+#else
+#define HIBERNATE_TEXT
+#endif
+
/*
* The size of the PE/COFF section that covers the kernel image, which
* runs from stext to _edata, must be a round multiple of the PE/COFF
@@ -63,14 +73,19 @@ PECOFF_FILE_ALIGNMENT = 0x200;
#endif
#if defined(CONFIG_DEBUG_ALIGN_RODATA)
-#define ALIGN_DEBUG_RO . = ALIGN(1<<SECTION_SHIFT);
-#define ALIGN_DEBUG_RO_MIN(min) ALIGN_DEBUG_RO
-#elif defined(CONFIG_DEBUG_RODATA)
-#define ALIGN_DEBUG_RO . = ALIGN(1<<PAGE_SHIFT);
-#define ALIGN_DEBUG_RO_MIN(min) ALIGN_DEBUG_RO
+/*
+ * 4 KB granule: 1 level 2 entry
+ * 16 KB granule: 128 level 3 entries, with contiguous bit
+ * 64 KB granule: 32 level 3 entries, with contiguous bit
+ */
+#define SEGMENT_ALIGN SZ_2M
#else
-#define ALIGN_DEBUG_RO
-#define ALIGN_DEBUG_RO_MIN(min) . = ALIGN(min);
+/*
+ * 4 KB granule: 16 level 3 entries, with contiguous bit
+ * 16 KB granule: 4 level 3 entries, without contiguous bit
+ * 64 KB granule: 1 level 3 entry
+ */
+#define SEGMENT_ALIGN SZ_64K
#endif
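
[Editor's note: checking the arithmetic behind these alignment choices, per the granule and contiguous-hint spans of the ARMv8 translation rules:]

	SZ_2M  / SZ_4K  = 512 entries -> exactly one level 2 block (4 KB granule)
	SZ_2M  / SZ_16K = 128 entries -> one contiguous-hint span at level 3 (16 KB)
	SZ_2M  / SZ_64K =  32 entries -> one contiguous-hint span at level 3 (64 KB)
	SZ_64K / SZ_4K  =  16 entries -> one contiguous-hint span at level 3 (4 KB)
	SZ_64K / SZ_16K =   4 entries -> too few for a contiguous-hint span
	SZ_64K / SZ_64K =   1 entry   -> a single level 3 entry
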
SECTIONS
@@ -87,40 +102,43 @@ SECTIONS
EXIT_CALL
*(.discard)
*(.discard.*)
+ *(.interp .dynamic)
+ *(.dynsym .dynstr .hash)
}
- . = PAGE_OFFSET + TEXT_OFFSET;
+ . = KIMAGE_VADDR + TEXT_OFFSET;
.head.text : {
_text = .;
HEAD_TEXT
}
- ALIGN_DEBUG_RO
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
*(.exception.text)
__exception_text_end = .;
IRQENTRY_TEXT
+ ENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
+ KPROBES_TEXT
HYPERVISOR_TEXT
IDMAP_TEXT
+ HIBERNATE_TEXT
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
*(.got) /* Global offset table */
}
- ALIGN_DEBUG_RO
- RO_DATA(PAGE_SIZE)
- EXCEPTION_TABLE(8)
+ . = ALIGN(SEGMENT_ALIGN);
+ RO_DATA(PAGE_SIZE) /* everything from this point to */
+ EXCEPTION_TABLE(8) /* _etext will be marked RO NX */
NOTES
- ALIGN_DEBUG_RO
- _etext = .; /* End of text and rodata section */
- ALIGN_DEBUG_RO_MIN(PAGE_SIZE)
+ . = ALIGN(SEGMENT_ALIGN);
+ _etext = .; /* End of text and rodata section */
__init_begin = .;
INIT_TEXT_SECTION(8)
@@ -128,7 +146,6 @@ SECTIONS
ARM_EXIT_KEEP(EXIT_TEXT)
}
- ALIGN_DEBUG_RO_MIN(16)
.init.data : {
INIT_DATA
INIT_SETUP(16)
@@ -143,9 +160,6 @@ SECTIONS
PERCPU_SECTION(L1_CACHE_BYTES)
- . = ALIGN(PAGE_SIZE);
- __init_end = .;
-
. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
@@ -155,8 +169,16 @@ SECTIONS
.altinstr_replacement : {
*(.altinstr_replacement)
}
+ .rela : ALIGN(8) {
+ *(.rela .rela*)
+ }
+
+ __rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR);
+ __rela_size = SIZEOF(.rela);
+
+ . = ALIGN(SEGMENT_ALIGN);
+ __init_end = .;
- . = ALIGN(PAGE_SIZE);
_data = .;
_sdata = .;
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
@@ -186,8 +208,12 @@ ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"HYP init code too big or misaligned")
ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"ID map text too big or misaligned")
+#ifdef CONFIG_HIBERNATION
+ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
+ <= SZ_4K, "Hibernate exit text too big or misaligned")
+#endif
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
*/
-ASSERT(_text == (PAGE_OFFSET + TEXT_OFFSET), "HEAD is misaligned")
+ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned")
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 1949fe5f5424..caee9ee8e12a 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -10,6 +10,7 @@ KVM=../../../virt/kvm
ARM=../../../arch/arm/kvm
obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp/
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
@@ -22,8 +23,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generi
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v2-switch.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 3039f080e2d5..e5ee8880d5d9 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -28,7 +28,6 @@
#include <asm/cputype.h>
#include <asm/uaccess.h>
#include <asm/kvm.h>
-#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_coproc.h>
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 15f0477b0d2a..25006a7a5316 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -23,6 +23,7 @@
#include <linux/kvm_host.h>
#include <asm/esr.h>
+#include <asm/kvm_asm.h>
#include <asm/kvm_coproc.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
@@ -182,6 +183,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
exit_handler = kvm_get_exit_handler(vcpu);
return exit_handler(vcpu, run);
+ case ARM_EXCEPTION_HYP_GONE:
+ /*
+ * EL2 has been reset to the hyp-stub. This happens when a guest
+ * is pre-empted by kvm_reboot()'s shutdown call.
+ */
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ return 0;
default:
kvm_pr_unimpl("Unsupported exception type: %d",
exception_index);
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 84c338f017b2..d87635e678b7 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -21,6 +21,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/pgtable-hwdef.h>
+#include <asm/sysreg.h>
.text
.pushsection .hyp.idmap.text, "ax"
@@ -96,6 +97,14 @@ __do_hyp_init:
ldr x4, =VTCR_EL2_FLAGS
bfi x4, x5, #16, #3
+ /*
+ * Read the VMIDBits field from ID_AA64MMFR1_EL1 and set the VS bit in
+ * VTCR_EL2.
+ */
+ mrs x5, ID_AA64MMFR1_EL1
+ ubfx x5, x5, #5, #1
+ lsl x5, x5, #VTCR_EL2_VS
+ orr x4, x4, x5
msr vtcr_el2, x4
@@ -108,8 +117,8 @@ __do_hyp_init:
dsb sy
mrs x4, sctlr_el2
- and x4, x4, #SCTLR_EL2_EE // preserve endianness of EL2
- ldr x5, =SCTLR_EL2_FLAGS
+ and x4, x4, #SCTLR_ELx_EE // preserve endianness of EL2
+ ldr x5, =SCTLR_ELx_FLAGS
orr x4, x4, x5
msr sctlr_el2, x4
isb
@@ -143,6 +152,44 @@ merged:
eret
ENDPROC(__kvm_hyp_init)
+ /*
+ * x0: HYP boot pgd
+ * x1: HYP phys_idmap_start
+ */
+ENTRY(__kvm_hyp_reset)
+ /* We're in trampoline code in VA, switch back to boot page tables */
+ msr ttbr0_el2, x0
+ isb
+
+ /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */
+ ic iallu
+ tlbi alle2
+ dsb sy
+ isb
+
+ /* Branch into PA space */
+ adr x0, 1f
+ bfi x1, x0, #0, #PAGE_SHIFT
+ br x1
+
+ /* We're now in idmap, disable MMU */
+1: mrs x0, sctlr_el2
+ ldr x1, =SCTLR_ELx_FLAGS
+ bic x0, x0, x1 // Clear SCTLR_ELx_M, etc.
+ msr sctlr_el2, x0
+ isb
+
+ /* Invalidate the old TLBs */
+ tlbi alle2
+ dsb sy
+
+ /* Install stub vectors */
+ adr_l x0, __hyp_stub_vectors
+ msr vbar_el2, x0
+
+ eret
+ENDPROC(__kvm_hyp_reset)
+
.ltorg
.popsection
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 86c289832272..7ce931565151 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -18,912 +18,11 @@
#include <linux/linkage.h>
#include <asm/alternative.h>
-#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/cpufeature.h>
-#include <asm/debug-monitors.h>
-#include <asm/esr.h>
-#include <asm/fpsimdmacros.h>
-#include <asm/kvm.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/memory.h>
-
-#define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x)
-#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
-#define CPU_SPSR_OFFSET(x) CPU_GP_REG_OFFSET(CPU_SPSR + 8*x)
-#define CPU_SYSREG_OFFSET(x) (CPU_SYSREGS + 8*x)
-
- .text
- .pushsection .hyp.text, "ax"
- .align PAGE_SHIFT
-
-.macro save_common_regs
- // x2: base address for cpu context
- // x3: tmp register
-
- add x3, x2, #CPU_XREG_OFFSET(19)
- stp x19, x20, [x3]
- stp x21, x22, [x3, #16]
- stp x23, x24, [x3, #32]
- stp x25, x26, [x3, #48]
- stp x27, x28, [x3, #64]
- stp x29, lr, [x3, #80]
-
- mrs x19, sp_el0
- mrs x20, elr_el2 // pc before entering el2
- mrs x21, spsr_el2 // pstate before entering el2
-
- stp x19, x20, [x3, #96]
- str x21, [x3, #112]
-
- mrs x22, sp_el1
- mrs x23, elr_el1
- mrs x24, spsr_el1
-
- str x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
- str x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)]
- str x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)]
-.endm
-
-.macro restore_common_regs
- // x2: base address for cpu context
- // x3: tmp register
-
- ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
- ldr x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)]
- ldr x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)]
-
- msr sp_el1, x22
- msr elr_el1, x23
- msr spsr_el1, x24
-
- add x3, x2, #CPU_XREG_OFFSET(31) // SP_EL0
- ldp x19, x20, [x3]
- ldr x21, [x3, #16]
-
- msr sp_el0, x19
- msr elr_el2, x20 // pc on return from el2
- msr spsr_el2, x21 // pstate on return from el2
-
- add x3, x2, #CPU_XREG_OFFSET(19)
- ldp x19, x20, [x3]
- ldp x21, x22, [x3, #16]
- ldp x23, x24, [x3, #32]
- ldp x25, x26, [x3, #48]
- ldp x27, x28, [x3, #64]
- ldp x29, lr, [x3, #80]
-.endm
-
-.macro save_host_regs
- save_common_regs
-.endm
-
-.macro restore_host_regs
- restore_common_regs
-.endm
-
-.macro save_fpsimd
- // x2: cpu context address
- // x3, x4: tmp regs
- add x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
- fpsimd_save x3, 4
-.endm
-
-.macro restore_fpsimd
- // x2: cpu context address
- // x3, x4: tmp regs
- add x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
- fpsimd_restore x3, 4
-.endm
-
-.macro save_guest_regs
- // x0 is the vcpu address
- // x1 is the return code, do not corrupt!
- // x2 is the cpu context
- // x3 is a tmp register
- // Guest's x0-x3 are on the stack
-
- // Compute base to save registers
- add x3, x2, #CPU_XREG_OFFSET(4)
- stp x4, x5, [x3]
- stp x6, x7, [x3, #16]
- stp x8, x9, [x3, #32]
- stp x10, x11, [x3, #48]
- stp x12, x13, [x3, #64]
- stp x14, x15, [x3, #80]
- stp x16, x17, [x3, #96]
- str x18, [x3, #112]
-
- pop x6, x7 // x2, x3
- pop x4, x5 // x0, x1
-
- add x3, x2, #CPU_XREG_OFFSET(0)
- stp x4, x5, [x3]
- stp x6, x7, [x3, #16]
-
- save_common_regs
-.endm
-
-.macro restore_guest_regs
- // x0 is the vcpu address.
- // x2 is the cpu context
- // x3 is a tmp register
-
- // Prepare x0-x3 for later restore
- add x3, x2, #CPU_XREG_OFFSET(0)
- ldp x4, x5, [x3]
- ldp x6, x7, [x3, #16]
- push x4, x5 // Push x0-x3 on the stack
- push x6, x7
-
- // x4-x18
- ldp x4, x5, [x3, #32]
- ldp x6, x7, [x3, #48]
- ldp x8, x9, [x3, #64]
- ldp x10, x11, [x3, #80]
- ldp x12, x13, [x3, #96]
- ldp x14, x15, [x3, #112]
- ldp x16, x17, [x3, #128]
- ldr x18, [x3, #144]
-
- // x19-x29, lr, sp*, elr*, spsr*
- restore_common_regs
-
- // Last bits of the 64bit state
- pop x2, x3
- pop x0, x1
-
- // Do not touch any register after this!
-.endm
-
-/*
- * Macros to perform system register save/restore.
- *
- * Ordering here is absolutely critical, and must be kept consistent
- * in {save,restore}_sysregs, {save,restore}_guest_32bit_state,
- * and in kvm_asm.h.
- *
- * In other words, don't touch any of these unless you know what
- * you are doing.
- */
-.macro save_sysregs
- // x2: base address for cpu context
- // x3: tmp register
-
- add x3, x2, #CPU_SYSREG_OFFSET(MPIDR_EL1)
-
- mrs x4, vmpidr_el2
- mrs x5, csselr_el1
- mrs x6, sctlr_el1
- mrs x7, actlr_el1
- mrs x8, cpacr_el1
- mrs x9, ttbr0_el1
- mrs x10, ttbr1_el1
- mrs x11, tcr_el1
- mrs x12, esr_el1
- mrs x13, afsr0_el1
- mrs x14, afsr1_el1
- mrs x15, far_el1
- mrs x16, mair_el1
- mrs x17, vbar_el1
- mrs x18, contextidr_el1
- mrs x19, tpidr_el0
- mrs x20, tpidrro_el0
- mrs x21, tpidr_el1
- mrs x22, amair_el1
- mrs x23, cntkctl_el1
- mrs x24, par_el1
- mrs x25, mdscr_el1
-
- stp x4, x5, [x3]
- stp x6, x7, [x3, #16]
- stp x8, x9, [x3, #32]
- stp x10, x11, [x3, #48]
- stp x12, x13, [x3, #64]
- stp x14, x15, [x3, #80]
- stp x16, x17, [x3, #96]
- stp x18, x19, [x3, #112]
- stp x20, x21, [x3, #128]
- stp x22, x23, [x3, #144]
- stp x24, x25, [x3, #160]
-.endm
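The ordering contract is easiest to audit against the enum that CPU_SYSREG_OFFSET() indexes into. A sketch of the matching layout (the authoritative definition lives in asm/kvm_host.h and may carry additional trailing entries):

/* Index order must mirror the mrs/stp sequence in save_sysregs. */
enum vcpu_sysreg {
	MPIDR_EL1,	/* saved from vmpidr_el2 */
	CSSELR_EL1,
	SCTLR_EL1,
	ACTLR_EL1,
	CPACR_EL1,
	TTBR0_EL1,
	TTBR1_EL1,
	TCR_EL1,
	ESR_EL1,
	AFSR0_EL1,
	AFSR1_EL1,
	FAR_EL1,
	MAIR_EL1,
	VBAR_EL1,
	CONTEXTIDR_EL1,
	TPIDR_EL0,
	TPIDRRO_EL0,
	TPIDR_EL1,
	AMAIR_EL1,
	CNTKCTL_EL1,
	PAR_EL1,
	MDSCR_EL1,
};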
-
-.macro save_debug type
- // x4: pointer to register set
- // x5: number of registers to skip
- // x6..x22 trashed
-
- adr x22, 1f
- add x22, x22, x5, lsl #2
- br x22
-1:
- mrs x21, \type\()15_el1
- mrs x20, \type\()14_el1
- mrs x19, \type\()13_el1
- mrs x18, \type\()12_el1
- mrs x17, \type\()11_el1
- mrs x16, \type\()10_el1
- mrs x15, \type\()9_el1
- mrs x14, \type\()8_el1
- mrs x13, \type\()7_el1
- mrs x12, \type\()6_el1
- mrs x11, \type\()5_el1
- mrs x10, \type\()4_el1
- mrs x9, \type\()3_el1
- mrs x8, \type\()2_el1
- mrs x7, \type\()1_el1
- mrs x6, \type\()0_el1
-
- adr x22, 1f
- add x22, x22, x5, lsl #2
- br x22
-1:
- str x21, [x4, #(15 * 8)]
- str x20, [x4, #(14 * 8)]
- str x19, [x4, #(13 * 8)]
- str x18, [x4, #(12 * 8)]
- str x17, [x4, #(11 * 8)]
- str x16, [x4, #(10 * 8)]
- str x15, [x4, #(9 * 8)]
- str x14, [x4, #(8 * 8)]
- str x13, [x4, #(7 * 8)]
- str x12, [x4, #(6 * 8)]
- str x11, [x4, #(5 * 8)]
- str x10, [x4, #(4 * 8)]
- str x9, [x4, #(3 * 8)]
- str x8, [x4, #(2 * 8)]
- str x7, [x4, #(1 * 8)]
- str x6, [x4, #(0 * 8)]
-.endm
-
-.macro restore_sysregs
- // x2: base address for cpu context
- // x3: tmp register
-
- add x3, x2, #CPU_SYSREG_OFFSET(MPIDR_EL1)
-
- ldp x4, x5, [x3]
- ldp x6, x7, [x3, #16]
- ldp x8, x9, [x3, #32]
- ldp x10, x11, [x3, #48]
- ldp x12, x13, [x3, #64]
- ldp x14, x15, [x3, #80]
- ldp x16, x17, [x3, #96]
- ldp x18, x19, [x3, #112]
- ldp x20, x21, [x3, #128]
- ldp x22, x23, [x3, #144]
- ldp x24, x25, [x3, #160]
-
- msr vmpidr_el2, x4
- msr csselr_el1, x5
- msr sctlr_el1, x6
- msr actlr_el1, x7
- msr cpacr_el1, x8
- msr ttbr0_el1, x9
- msr ttbr1_el1, x10
- msr tcr_el1, x11
- msr esr_el1, x12
- msr afsr0_el1, x13
- msr afsr1_el1, x14
- msr far_el1, x15
- msr mair_el1, x16
- msr vbar_el1, x17
- msr contextidr_el1, x18
- msr tpidr_el0, x19
- msr tpidrro_el0, x20
- msr tpidr_el1, x21
- msr amair_el1, x22
- msr cntkctl_el1, x23
- msr par_el1, x24
- msr mdscr_el1, x25
-.endm
-
-.macro restore_debug type
- // x4: pointer to register set
- // x5: number of registers to skip
- // x6..x22 trashed
-
- adr x22, 1f
- add x22, x22, x5, lsl #2
- br x22
-1:
- ldr x21, [x4, #(15 * 8)]
- ldr x20, [x4, #(14 * 8)]
- ldr x19, [x4, #(13 * 8)]
- ldr x18, [x4, #(12 * 8)]
- ldr x17, [x4, #(11 * 8)]
- ldr x16, [x4, #(10 * 8)]
- ldr x15, [x4, #(9 * 8)]
- ldr x14, [x4, #(8 * 8)]
- ldr x13, [x4, #(7 * 8)]
- ldr x12, [x4, #(6 * 8)]
- ldr x11, [x4, #(5 * 8)]
- ldr x10, [x4, #(4 * 8)]
- ldr x9, [x4, #(3 * 8)]
- ldr x8, [x4, #(2 * 8)]
- ldr x7, [x4, #(1 * 8)]
- ldr x6, [x4, #(0 * 8)]
-
- adr x22, 1f
- add x22, x22, x5, lsl #2
- br x22
-1:
- msr \type\()15_el1, x21
- msr \type\()14_el1, x20
- msr \type\()13_el1, x19
- msr \type\()12_el1, x18
- msr \type\()11_el1, x17
- msr \type\()10_el1, x16
- msr \type\()9_el1, x15
- msr \type\()8_el1, x14
- msr \type\()7_el1, x13
- msr \type\()6_el1, x12
- msr \type\()5_el1, x11
- msr \type\()4_el1, x10
- msr \type\()3_el1, x9
- msr \type\()2_el1, x8
- msr \type\()1_el1, x7
- msr \type\()0_el1, x6
-.endm
-
-.macro skip_32bit_state tmp, target
- // Skip 32bit state if not needed
- mrs \tmp, hcr_el2
- tbnz \tmp, #HCR_RW_SHIFT, \target
-.endm
-
-.macro skip_tee_state tmp, target
- // Skip ThumbEE state if not needed
- mrs \tmp, id_pfr0_el1
- tbz \tmp, #12, \target
-.endm
-
-.macro skip_debug_state tmp, target
- ldr \tmp, [x0, #VCPU_DEBUG_FLAGS]
- tbz \tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
-.endm
-
-/*
- * Branch to target if CPTR_EL2.TFP bit is set (VFP/SIMD trapping enabled)
- */
-.macro skip_fpsimd_state tmp, target
- mrs \tmp, cptr_el2
- tbnz \tmp, #CPTR_EL2_TFP_SHIFT, \target
-.endm
-
-.macro compute_debug_state target
- // Compute debug state: if any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
- // is set, we do a full save/restore cycle and disable trapping.
- add x25, x0, #VCPU_CONTEXT
-
- // Check the state of MDSCR_EL1
- ldr x25, [x25, #CPU_SYSREG_OFFSET(MDSCR_EL1)]
- and x26, x25, #DBG_MDSCR_KDE
- and x25, x25, #DBG_MDSCR_MDE
- adds xzr, x25, x26
- b.eq 9998f // Nothing to see there
-
- // If any interesting bits were set, we must set the flag
- mov x26, #KVM_ARM64_DEBUG_DIRTY
- str x26, [x0, #VCPU_DEBUG_FLAGS]
- b 9999f // Don't skip restore
-
-9998:
- // Otherwise load the flags from memory in case we recently
- // trapped
- skip_debug_state x25, \target
-9999:
-.endm
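The decision the macro encodes is more obvious in C. A rough sketch, using the vcpu fields the offsets above point at:

static void compute_debug_state(struct kvm_vcpu *vcpu)
{
	u64 mdscr = vcpu->arch.ctxt.sys_regs[MDSCR_EL1];

	/* KDE or MDE set: force a full debug save/restore this run. */
	if (mdscr & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
		vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
}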
-
-.macro save_guest_32bit_state
- skip_32bit_state x3, 1f
-
- add x3, x2, #CPU_SPSR_OFFSET(KVM_SPSR_ABT)
- mrs x4, spsr_abt
- mrs x5, spsr_und
- mrs x6, spsr_irq
- mrs x7, spsr_fiq
- stp x4, x5, [x3]
- stp x6, x7, [x3, #16]
-
- add x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
- mrs x4, dacr32_el2
- mrs x5, ifsr32_el2
- stp x4, x5, [x3]
-
- skip_fpsimd_state x8, 2f
- mrs x6, fpexc32_el2
- str x6, [x3, #16]
-2:
- skip_debug_state x8, 1f
- mrs x7, dbgvcr32_el2
- str x7, [x3, #24]
-1:
-.endm
-
-.macro restore_guest_32bit_state
- skip_32bit_state x3, 1f
-
- add x3, x2, #CPU_SPSR_OFFSET(KVM_SPSR_ABT)
- ldp x4, x5, [x3]
- ldp x6, x7, [x3, #16]
- msr spsr_abt, x4
- msr spsr_und, x5
- msr spsr_irq, x6
- msr spsr_fiq, x7
-
- add x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
- ldp x4, x5, [x3]
- msr dacr32_el2, x4
- msr ifsr32_el2, x5
-
- skip_debug_state x8, 1f
- ldr x7, [x3, #24]
- msr dbgvcr32_el2, x7
-1:
-.endm
-
-.macro activate_traps
- ldr x2, [x0, #VCPU_HCR_EL2]
-
- /*
- * We are about to set CPTR_EL2.TFP to trap all floating point
- * register accesses to EL2, however, the ARM ARM clearly states that
- * traps are only taken to EL2 if the operation would not otherwise
- * trap to EL1. Therefore, always make sure that for 32-bit guests,
- * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
- */
- tbnz x2, #HCR_RW_SHIFT, 99f // open code skip_32bit_state
- mov x3, #(1 << 30)
- msr fpexc32_el2, x3
- isb
-99:
- msr hcr_el2, x2
- mov x2, #CPTR_EL2_TTA
- orr x2, x2, #CPTR_EL2_TFP
- msr cptr_el2, x2
-
- mov x2, #(1 << 15) // Trap CP15 Cr=15
- msr hstr_el2, x2
-
- // Monitor Debug Config - see kvm_arm_setup_debug()
- ldr x2, [x0, #VCPU_MDCR_EL2]
- msr mdcr_el2, x2
-.endm
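The FPEXC fixup reads more naturally in C. A sketch using the read_sysreg/write_sysreg helpers this series introduces (the function name is illustrative):

static void activate_fp_traps(struct kvm_vcpu *vcpu)
{
	u64 hcr = vcpu->arch.hcr_el2;

	/*
	 * 32-bit guest: force FPEXC.EN so that a trapped FP access
	 * is taken to EL2 rather than EL1 once CPTR_EL2.TFP is set.
	 */
	if (!(hcr & HCR_RW)) {
		write_sysreg(1 << 30, fpexc32_el2);
		isb();
	}

	write_sysreg(hcr, hcr_el2);
	write_sysreg(CPTR_EL2_TTA | CPTR_EL2_TFP, cptr_el2);
}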
-
-.macro deactivate_traps
- mov x2, #HCR_RW
- msr hcr_el2, x2
- msr hstr_el2, xzr
-
- mrs x2, mdcr_el2
- and x2, x2, #MDCR_EL2_HPMN_MASK
- msr mdcr_el2, x2
-.endm
-
-.macro activate_vm
- ldr x1, [x0, #VCPU_KVM]
- kern_hyp_va x1
- ldr x2, [x1, #KVM_VTTBR]
- msr vttbr_el2, x2
-.endm
-
-.macro deactivate_vm
- msr vttbr_el2, xzr
-.endm
-
-/*
- * Call into the vgic backend for state saving
- */
-.macro save_vgic_state
-alternative_if_not ARM64_HAS_SYSREG_GIC_CPUIF
- bl __save_vgic_v2_state
-alternative_else
- bl __save_vgic_v3_state
-alternative_endif
- mrs x24, hcr_el2
- mov x25, #HCR_INT_OVERRIDE
- neg x25, x25
- and x24, x24, x25
- msr hcr_el2, x24
-.endm
-
-/*
- * Call into the vgic backend for state restoring
- */
-.macro restore_vgic_state
- mrs x24, hcr_el2
- ldr x25, [x0, #VCPU_IRQ_LINES]
- orr x24, x24, #HCR_INT_OVERRIDE
- orr x24, x24, x25
- msr hcr_el2, x24
-alternative_if_not ARM64_HAS_SYSREG_GIC_CPUIF
- bl __restore_vgic_v2_state
-alternative_else
- bl __restore_vgic_v3_state
-alternative_endif
-.endm
-
-.macro save_timer_state
- // x0: vcpu pointer
- ldr x2, [x0, #VCPU_KVM]
- kern_hyp_va x2
- ldr w3, [x2, #KVM_TIMER_ENABLED]
- cbz w3, 1f
-
- mrs x3, cntv_ctl_el0
- and x3, x3, #3
- str w3, [x0, #VCPU_TIMER_CNTV_CTL]
-
- isb
-
- mrs x3, cntv_cval_el0
- str x3, [x0, #VCPU_TIMER_CNTV_CVAL]
-
-1:
- // Disable the virtual timer
- msr cntv_ctl_el0, xzr
-
- // Allow physical timer/counter access for the host
- mrs x2, cnthctl_el2
- orr x2, x2, #3
- msr cnthctl_el2, x2
-
- // Clear cntvoff for the host
- msr cntvoff_el2, xzr
-.endm
-
-.macro restore_timer_state
- // x0: vcpu pointer
- // Disallow physical timer access for the guest
- // Physical counter access is allowed
- mrs x2, cnthctl_el2
- orr x2, x2, #1
- bic x2, x2, #2
- msr cnthctl_el2, x2
-
- ldr x2, [x0, #VCPU_KVM]
- kern_hyp_va x2
- ldr w3, [x2, #KVM_TIMER_ENABLED]
- cbz w3, 1f
-
- ldr x3, [x2, #KVM_TIMER_CNTVOFF]
- msr cntvoff_el2, x3
- ldr x2, [x0, #VCPU_TIMER_CNTV_CVAL]
- msr cntv_cval_el0, x2
- isb
-
- ldr w2, [x0, #VCPU_TIMER_CNTV_CTL]
- and x2, x2, #3
- msr cntv_ctl_el0, x2
-1:
-.endm
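The cnthctl_el2 juggling is the subtle part: bit 0 (EL1PCTEN) gates physical counter reads, bit 1 (EL1PCEN) gates physical timer accesses. A C sketch of the guest-entry side, using the same literal bits as the assembly:

static void timer_enter_guest(void)
{
	u64 val = read_sysreg(cnthctl_el2);

	val |= 1;		/* EL1PCTEN: counter reads allowed */
	val &= ~(u64)2;		/* EL1PCEN: timer accesses trap */
	write_sysreg(val, cnthctl_el2);
}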
-
-__save_sysregs:
- save_sysregs
- ret
-
-__restore_sysregs:
- restore_sysregs
- ret
-
-/* Save debug state */
-__save_debug:
- // x2: ptr to CPU context
- // x3: ptr to debug reg struct
- // x4/x5/x6-22/x24-26: trashed
-
- mrs x26, id_aa64dfr0_el1
- ubfx x24, x26, #12, #4 // Extract BRPs
- ubfx x25, x26, #20, #4 // Extract WRPs
- mov w26, #15
- sub w24, w26, w24 // How many BPs to skip
- sub w25, w26, w25 // How many WPs to skip
-
- mov x5, x24
- add x4, x3, #DEBUG_BCR
- save_debug dbgbcr
- add x4, x3, #DEBUG_BVR
- save_debug dbgbvr
-
- mov x5, x25
- add x4, x3, #DEBUG_WCR
- save_debug dbgwcr
- add x4, x3, #DEBUG_WVR
- save_debug dbgwvr
-
- mrs x21, mdccint_el1
- str x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
- ret
-
-/* Restore debug state */
-__restore_debug:
- // x2: ptr to CPU context
- // x3: ptr to debug reg struct
- // x4/x5/x6-22/x24-26: trashed
-
- mrs x26, id_aa64dfr0_el1
- ubfx x24, x26, #12, #4 // Extract BRPs
- ubfx x25, x26, #20, #4 // Extract WRPs
- mov w26, #15
- sub w24, w26, w24 // How many BPs to skip
- sub w25, w26, w25 // How many WPs to skip
-
- mov x5, x24
- add x4, x3, #DEBUG_BCR
- restore_debug dbgbcr
- add x4, x3, #DEBUG_BVR
- restore_debug dbgbvr
-
- mov x5, x25
- add x4, x3, #DEBUG_WCR
- restore_debug dbgwcr
- add x4, x3, #DEBUG_WVR
- restore_debug dbgwvr
-
- ldr x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
- msr mdccint_el1, x21
-
- ret
-
-__save_fpsimd:
- skip_fpsimd_state x3, 1f
- save_fpsimd
-1: ret
-
-__restore_fpsimd:
- skip_fpsimd_state x3, 1f
- restore_fpsimd
-1: ret
-
-switch_to_guest_fpsimd:
- push x4, lr
-
- mrs x2, cptr_el2
- bic x2, x2, #CPTR_EL2_TFP
- msr cptr_el2, x2
- isb
-
- mrs x0, tpidr_el2
-
- ldr x2, [x0, #VCPU_HOST_CONTEXT]
- kern_hyp_va x2
- bl __save_fpsimd
-
- add x2, x0, #VCPU_CONTEXT
- bl __restore_fpsimd
-
- skip_32bit_state x3, 1f
- ldr x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
- msr fpexc32_el2, x4
-1:
- pop x4, lr
- pop x2, x3
- pop x0, x1
-
- eret
-
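switch_to_guest_fpsimd above is the lazy-FPSIMD slow path: the first trapped FP access disables the trap and swaps register files, so later accesses run untrapped. In outline (a sketch; __fpsimd_{save,restore}_state are the helpers this series adds in hyp/fpsimd.S):

static void guest_fpsimd_trap(struct kvm_vcpu *vcpu,
			      struct kvm_cpu_context *host_ctxt)
{
	/* Stop trapping FP/SIMD for the rest of this guest run. */
	write_sysreg(read_sysreg(cptr_el2) & ~CPTR_EL2_TFP, cptr_el2);
	isb();

	__fpsimd_save_state(&host_ctxt->gp_regs.fp_regs);
	__fpsimd_restore_state(&vcpu->arch.ctxt.gp_regs.fp_regs);
}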
-/*
- * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
- *
- * This is the world switch. The first half of the function
- * deals with entering the guest, and anything from __kvm_vcpu_return
- * to the end of the function deals with reentering the host.
- * On the enter path, only x0 (vcpu pointer) must be preserved until
- * the last moment. On the exit path, x0 (vcpu pointer) and x1 (exception
- * code) must both be preserved until the epilogue.
- * In both cases, x2 points to the CPU context we're saving/restoring from/to.
- */
-ENTRY(__kvm_vcpu_run)
- kern_hyp_va x0
- msr tpidr_el2, x0 // Save the vcpu register
-
- // Host context
- ldr x2, [x0, #VCPU_HOST_CONTEXT]
- kern_hyp_va x2
-
- save_host_regs
- bl __save_sysregs
-
- compute_debug_state 1f
- add x3, x0, #VCPU_HOST_DEBUG_STATE
- bl __save_debug
-1:
- activate_traps
- activate_vm
-
- restore_vgic_state
- restore_timer_state
-
- // Guest context
- add x2, x0, #VCPU_CONTEXT
-
- // We must restore the 32-bit state before the sysregs, thanks
- // to Cortex-A57 erratum #852523.
- restore_guest_32bit_state
- bl __restore_sysregs
-
- skip_debug_state x3, 1f
- ldr x3, [x0, #VCPU_DEBUG_PTR]
- kern_hyp_va x3
- bl __restore_debug
-1:
- restore_guest_regs
-
- // That's it, no more messing around.
- eret
-
-__kvm_vcpu_return:
- // Assume x0 is the vcpu pointer, x1 the return code
- // Guest's x0-x3 are on the stack
-
- // Guest context
- add x2, x0, #VCPU_CONTEXT
-
- save_guest_regs
- bl __save_fpsimd
- bl __save_sysregs
-
- skip_debug_state x3, 1f
- ldr x3, [x0, #VCPU_DEBUG_PTR]
- kern_hyp_va x3
- bl __save_debug
-1:
- save_guest_32bit_state
-
- save_timer_state
- save_vgic_state
-
- deactivate_traps
- deactivate_vm
-
- // Host context
- ldr x2, [x0, #VCPU_HOST_CONTEXT]
- kern_hyp_va x2
-
- bl __restore_sysregs
- bl __restore_fpsimd
- /* Clear FPSIMD and Trace trapping */
- msr cptr_el2, xzr
-
- skip_debug_state x3, 1f
- // Clear the dirty flag for the next run, as all the state has
- // already been saved. Note that we nuke the whole 64bit word.
- // If we ever add more flags, we'll have to be more careful...
- str xzr, [x0, #VCPU_DEBUG_FLAGS]
- add x3, x0, #VCPU_HOST_DEBUG_STATE
- bl __restore_debug
-1:
- restore_host_regs
-
- mov x0, x1
- ret
-END(__kvm_vcpu_run)
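Seen from C, __kvm_vcpu_run is a bracket around guest execution. A sketch of the flow (helper names are illustrative, patterned on the hyp/ files this series introduces):

static int world_switch(struct kvm_vcpu *vcpu)
{
	struct kvm_cpu_context *host = kern_hyp_va(vcpu->arch.host_cpu_context);
	struct kvm_cpu_context *guest = &vcpu->arch.ctxt;
	int exit_code;

	save_host_state(host);
	activate_traps_and_vm(vcpu);

	/* 32-bit state before sysregs: Cortex-A57 erratum #852523. */
	restore_guest_32bit_state(vcpu);
	restore_guest_state(guest);

	exit_code = enter_guest(vcpu);	/* eret; back via the vectors */

	save_guest_state(guest);
	deactivate_traps_and_vm(vcpu);
	restore_host_state(host);

	return exit_code;
}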
-
-// void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
-ENTRY(__kvm_tlb_flush_vmid_ipa)
- dsb ishst
-
- kern_hyp_va x0
- ldr x2, [x0, #KVM_VTTBR]
- msr vttbr_el2, x2
- isb
-
- /*
- * We could do so much better if we had the VA as well.
- * Instead, we invalidate Stage-2 for this IPA, and the
- * whole of Stage-1. Weep...
- */
- lsr x1, x1, #12
- tlbi ipas2e1is, x1
- /*
- * We have to ensure completion of the invalidation at Stage-2,
- * since a table walk on another CPU could refill a TLB with a
- * complete (S1 + S2) walk based on the old Stage-2 mapping if
- * the Stage-1 invalidation happened first.
- */
- dsb ish
- tlbi vmalle1is
- dsb ish
- isb
-
- msr vttbr_el2, xzr
- ret
-ENDPROC(__kvm_tlb_flush_vmid_ipa)
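The same sequence, barrier for barrier, in C (a sketch; kern_hyp_va() and the vttbr field are as used elsewhere in this patch):

static void tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	kvm = kern_hyp_va(kvm);

	dsb(ishst);
	write_sysreg(kvm->arch.vttbr, vttbr_el2);
	isb();

	asm volatile("tlbi ipas2e1is, %0" : : "r" (ipa >> 12));
	dsb(ish);	/* complete the Stage-2 invalidation first... */
	asm volatile("tlbi vmalle1is");
	dsb(ish);	/* ...then the Stage-1 one */
	isb();

	write_sysreg(0, vttbr_el2);
}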
-
-/**
- * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
- * @kvm:	pointer to the kvm structure
- *
- * Invalidates all Stage 1 and Stage 2 TLB entries for the current VMID.
- */
-ENTRY(__kvm_tlb_flush_vmid)
- dsb ishst
-
- kern_hyp_va x0
- ldr x2, [x0, #KVM_VTTBR]
- msr vttbr_el2, x2
- isb
-
- tlbi vmalls12e1is
- dsb ish
- isb
-
- msr vttbr_el2, xzr
- ret
-ENDPROC(__kvm_tlb_flush_vmid)
-
-ENTRY(__kvm_flush_vm_context)
- dsb ishst
- tlbi alle1is
- ic ialluis
- dsb ish
- ret
-ENDPROC(__kvm_flush_vm_context)
-
-__kvm_hyp_panic:
- // Stash PAR_EL1 before corrupting it in __restore_sysregs
- mrs x0, par_el1
- push x0, xzr
-
- // Guess the context by looking at VTTBR:
- // If zero, then we're already a host.
- // Otherwise restore a minimal host context before panicking.
- mrs x0, vttbr_el2
- cbz x0, 1f
-
- mrs x0, tpidr_el2
-
- deactivate_traps
- deactivate_vm
-
- ldr x2, [x0, #VCPU_HOST_CONTEXT]
- kern_hyp_va x2
-
- bl __restore_sysregs
-
- /*
- * Make sure we have a valid host stack, and don't leave junk in the
- * frame pointer that would make host stack unwinding misleading.
- */
- ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
- msr sp_el1, x22
- mov x29, xzr
-
-1: adr x0, __hyp_panic_str
- adr x1, 2f
- ldp x2, x3, [x1]
- sub x0, x0, x2
- add x0, x0, x3
- mrs x1, spsr_el2
- mrs x2, elr_el2
- mrs x3, esr_el2
- mrs x4, far_el2
- mrs x5, hpfar_el2
- pop x6, xzr // active context PAR_EL1
- mrs x7, tpidr_el2
-
- mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
- PSR_MODE_EL1h)
- msr spsr_el2, lr
- ldr lr, =panic
- msr elr_el2, lr
- eret
-
- .align 3
-2: .quad HYP_PAGE_OFFSET
- .quad PAGE_OFFSET
-ENDPROC(__kvm_hyp_panic)
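The adr/ldp/sub/add sequence at label 1 relocates __hyp_panic_str from its HYP alias into the kernel linear map so panic() can dereference it; the arithmetic amounts to:

/* Sketch: same VA conversion as the assembly above. */
#define hyp_str_to_kern(va)	((va) - HYP_PAGE_OFFSET + PAGE_OFFSET)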
-
-__hyp_panic_str:
- .ascii "HYP panic:\nPS:%08x PC:%016x ESR:%08x\nFAR:%016x HPFAR:%016x PAR:%016x\nVCPU:%p\n\0"
-
- .align 2
/*
- * u64 kvm_call_hyp(void *hypfn, ...);
+ * u64 __kvm_call_hyp(void *hypfn, ...);
*
 * This is not really a variadic function in the classic C way and care must
* be taken when calling this to ensure parameters are passed in registers
@@ -934,189 +33,23 @@ __hyp_panic_str:
* passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
* function pointer can be passed). The function being called must be mapped
* in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
- * passed in r0 and r1.
+ * passed in x0.
*
- * A function pointer with a value of 0 has a special meaning, and is
- * used to implement __hyp_get_vectors in the same way as in
+ * A function pointer with a value less than 0xfff has a special meaning,
+ * and is used to implement __hyp_get_vectors in the same way as in
* arch/arm64/kernel/hyp_stub.S.
+ * HVC behaves as a 'bl' call and will clobber lr.
*/
-ENTRY(kvm_call_hyp)
+ENTRY(__kvm_call_hyp)
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+ str lr, [sp, #-16]!
hvc #0
+ ldr lr, [sp], #16
ret
-ENDPROC(kvm_call_hyp)
-
-.macro invalid_vector label, target
- .align 2
-\label:
- b \target
-ENDPROC(\label)
-.endm
-
- /* None of these should ever happen */
- invalid_vector el2t_sync_invalid, __kvm_hyp_panic
- invalid_vector el2t_irq_invalid, __kvm_hyp_panic
- invalid_vector el2t_fiq_invalid, __kvm_hyp_panic
- invalid_vector el2t_error_invalid, __kvm_hyp_panic
- invalid_vector el2h_sync_invalid, __kvm_hyp_panic
- invalid_vector el2h_irq_invalid, __kvm_hyp_panic
- invalid_vector el2h_fiq_invalid, __kvm_hyp_panic
- invalid_vector el2h_error_invalid, __kvm_hyp_panic
- invalid_vector el1_sync_invalid, __kvm_hyp_panic
- invalid_vector el1_irq_invalid, __kvm_hyp_panic
- invalid_vector el1_fiq_invalid, __kvm_hyp_panic
- invalid_vector el1_error_invalid, __kvm_hyp_panic
-
-el1_sync: // Guest trapped into EL2
- push x0, x1
- push x2, x3
-
- mrs x1, esr_el2
- lsr x2, x1, #ESR_ELx_EC_SHIFT
-
- cmp x2, #ESR_ELx_EC_HVC64
- b.ne el1_trap
-
- mrs x3, vttbr_el2 // If vttbr is valid, the 64bit guest
- cbnz x3, el1_trap // called HVC
-
- /* Here, we're pretty sure the host called HVC. */
- pop x2, x3
- pop x0, x1
-
- /* Check for __hyp_get_vectors */
- cbnz x0, 1f
- mrs x0, vbar_el2
- b 2f
-
-1: push lr, xzr
-
- /*
- * Compute the function address in EL2, and shuffle the parameters.
- */
- kern_hyp_va x0
- mov lr, x0
- mov x0, x1
- mov x1, x2
- mov x2, x3
- blr lr
-
- pop lr, xzr
-2: eret
-
-el1_trap:
- /*
- * x1: ESR
- * x2: ESR_EC
- */
-
- /* Guest accessed VFP/SIMD registers, save host, restore Guest */
- cmp x2, #ESR_ELx_EC_FP_ASIMD
- b.eq switch_to_guest_fpsimd
-
- cmp x2, #ESR_ELx_EC_DABT_LOW
- mov x0, #ESR_ELx_EC_IABT_LOW
- ccmp x2, x0, #4, ne
- b.ne 1f // Not an abort we care about
-
- /* This is an abort. Check for permission fault */
-alternative_if_not ARM64_WORKAROUND_834220
- and x2, x1, #ESR_ELx_FSC_TYPE
- cmp x2, #FSC_PERM
- b.ne 1f // Not a permission fault
alternative_else
- nop // Use the permission fault path to
- nop // check for a valid S1 translation,
- nop // regardless of the ESR value.
+ b __vhe_hyp_call
+ nop
+ nop
+ nop
alternative_endif
-
- /*
- * Check for Stage-1 page table walk, which is guaranteed
- * to give a valid HPFAR_EL2.
- */
- tbnz x1, #7, 1f // S1PTW is set
-
- /* Preserve PAR_EL1 */
- mrs x3, par_el1
- push x3, xzr
-
- /*
- * Permission fault, HPFAR_EL2 is invalid.
- * Resolve the IPA the hard way using the guest VA.
- * Stage-1 translation already validated the memory access rights.
- * As such, we can use the EL1 translation regime, and don't have
- * to distinguish between EL0 and EL1 access.
- */
- mrs x2, far_el2
- at s1e1r, x2
- isb
-
- /* Read result */
- mrs x3, par_el1
- pop x0, xzr // Restore PAR_EL1 from the stack
- msr par_el1, x0
- tbnz x3, #0, 3f // Bail out if we failed the translation
- ubfx x3, x3, #12, #36 // Extract IPA
- lsl x3, x3, #4 // and present it like HPFAR
- b 2f
-
-1: mrs x3, hpfar_el2
- mrs x2, far_el2
-
-2: mrs x0, tpidr_el2
- str w1, [x0, #VCPU_ESR_EL2]
- str x2, [x0, #VCPU_FAR_EL2]
- str x3, [x0, #VCPU_HPFAR_EL2]
-
- mov x1, #ARM_EXCEPTION_TRAP
- b __kvm_vcpu_return
-
- /*
- * Translation failed. Just return to the guest and
- * let it fault again. Another CPU is probably playing
- * behind our back.
- */
-3: pop x2, x3
- pop x0, x1
-
- eret
-
-el1_irq:
- push x0, x1
- push x2, x3
- mrs x0, tpidr_el2
- mov x1, #ARM_EXCEPTION_IRQ
- b __kvm_vcpu_return
-
- .ltorg
-
- .align 11
-
-ENTRY(__kvm_hyp_vector)
- ventry el2t_sync_invalid // Synchronous EL2t
- ventry el2t_irq_invalid // IRQ EL2t
- ventry el2t_fiq_invalid // FIQ EL2t
- ventry el2t_error_invalid // Error EL2t
-
- ventry el2h_sync_invalid // Synchronous EL2h
- ventry el2h_irq_invalid // IRQ EL2h
- ventry el2h_fiq_invalid // FIQ EL2h
- ventry el2h_error_invalid // Error EL2h
-
- ventry el1_sync // Synchronous 64-bit EL1
- ventry el1_irq // IRQ 64-bit EL1
- ventry el1_fiq_invalid // FIQ 64-bit EL1
- ventry el1_error_invalid // Error 64-bit EL1
-
- ventry el1_sync // Synchronous 32-bit EL1
- ventry el1_irq // IRQ 32-bit EL1
- ventry el1_fiq_invalid // FIQ 32-bit EL1
- ventry el1_error_invalid // Error 32-bit EL1
-ENDPROC(__kvm_hyp_vector)
-
-
-ENTRY(__kvm_get_mdcr_el2)
- mrs x0, mdcr_el2
- ret
-ENDPROC(__kvm_get_mdcr_el2)
-
- .popsection
+ENDPROC(__kvm_call_hyp)
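Host-side callers go through the kvm_call_hyp() wrapper rather than issuing the HVC themselves; a typical use, with the target in x0 and its arguments in the following registers per the usual AAPCS64 rules:

kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);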
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
new file mode 100644
index 000000000000..826032bc3945
--- /dev/null
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Kernel-based Virtual Machine module, HYP part
+#
+
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += entry.o
+obj-$(CONFIG_KVM_ARM_HOST) += switch.o
+obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
+obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
new file mode 100644
index 000000000000..2f8bca8af295
--- /dev/null
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+#include "hyp.h"
+
+#define read_debug(r,n) read_sysreg(r##n##_el1)
+#define write_debug(v,r,n) write_sysreg(v, r##n##_el1)
+
+#define save_debug(ptr,reg,nr) \
+ switch (nr) { \
+ case 15: ptr[15] = read_debug(reg, 15); \
+ case 14: ptr[14] = read_debug(reg, 14); \
+ case 13: ptr[13] = read_debug(reg, 13); \
+ case 12: ptr[12] = read_debug(reg, 12); \
+ case 11: ptr[11] = read_debug(reg, 11); \
+ case 10: ptr[10] = read_debug(reg, 10); \
+ case 9: ptr[9] = read_debug(reg, 9); \
+ case 8: ptr[8] = read_debug(reg, 8); \
+ case 7: ptr[7] = read_debug(reg, 7